@@ -1,6 +1,8 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstdint>
+#include <cstdio>
 #endif
 
 #include "llama_util.h"
@@ -1759,8 +1761,7 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, int n_threads) {
-    // TODO: refactor all of this after PR #801
+int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     auto & model = ctx->model;
@@ -1801,13 +1802,13 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
 
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> buf(1024ull * 1024ull * 1024ull);
+    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
     struct ggml_init_params params;
-    params.mem_size   = buf.size();
-    params.mem_buffer = buf.data();
+    params.mem_size   = lora_buf.size();
+    params.mem_buffer = lora_buf.data();
     params.no_alloc   = false;
 
-    ggml_context* lora_ctx = ggml_init(params);
+    ggml_context * lora_ctx = ggml_init(params);
     std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
 
     // create a name -> tensor map of the model to accelerate lookups
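For readers new to ggml: the hunk above uses ggml's standard arena pattern, preallocating one buffer and handing it to ggml_init so all tensor metadata and data live inside it. A minimal self-contained sketch of that same pattern, using an arbitrary 16 MiB arena instead of the 1 GiB chosen here:

    #include "ggml.h"
    #include <cstdint>
    #include <vector>

    int demo_ctx() {
        std::vector<uint8_t> buf(16u * 1024u * 1024u);  // arena for tensor metadata + data
        struct ggml_init_params params;
        params.mem_size   = buf.size();
        params.mem_buffer = buf.data();  // ggml allocates out of this buffer, no malloc
        params.no_alloc   = false;       // true => metadata only; tensor data lives elsewhere (e.g. an mmap)
        struct ggml_context * ctx = ggml_init(params);
        if (ctx == NULL) {
            return 1;
        }
        // ... create tensors and build graphs against ctx ...
        ggml_free(ctx);
        return 0;
    }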
@@ -1816,6 +1817,32 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         model_tensors.insert(kv);
     }
 
+
+    // load base model
+    std::unique_ptr<llama_model_loader> model_loader;
+    ggml_context * base_ctx = NULL;
+    llama_buffer base_buf;
+    if (path_base_model) {
+        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
+
+        size_t ctx_size, mmapped_size;
+        model_loader->calc_sizes(&ctx_size, &mmapped_size);
+        base_buf.resize(ctx_size);
+
+        ggml_init_params base_params;
+        base_params.mem_size   = base_buf.size;
+        base_params.mem_buffer = base_buf.addr;
+        base_params.no_alloc   = model_loader->use_mmap;
+
+        base_ctx = ggml_init(base_params);
+
+        model_loader->ggml_ctx = base_ctx;
+
+        // maybe this should be in llama_model_loader
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, false));
+    }
+
     fprintf(stderr, "%s: ", __func__);
 
     // read tensors and apply
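A note on the base-model path above: the loader mmaps the base model and builds a metadata-only ggml context for it (no_alloc is set whenever mmap is in use), so base tensors are faulted in lazily from the mapping instead of being copied into base_buf; the final false passed to llama_mmap appears to disable prefetching, which fits since only the adapted tensors are ever read. A generic POSIX sketch of that lazy-mapping idea (plain mmap; llama_mmap wraps the same mechanism, and the function name here is illustrative, not the loader's API):

    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstddef>

    void * map_file_readonly(const char * path, size_t * out_size) {
        int fd = open(path, O_RDONLY);
        if (fd < 0) { return NULL; }
        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return NULL; }
        // Read-only private mapping: pages are loaded on demand as tensors are touched.
        void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        close(fd);  // the mapping stays valid after the fd is closed
        if (addr == MAP_FAILED) { return NULL; }
        *out_size = (size_t) st.st_size;
        return addr;
    }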
@@ -1892,13 +1919,31 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
             lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
 
-            ggml_tensor * tensor = model_tensors[base_name];
+            ggml_tensor * dest_t = model_tensors[base_name];
+            ggml_tensor * base_t;
+            if (model_loader) {
+                // load from base model
+                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    return 1;
+                }
+                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
+                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                lt.data = (uint8_t *) lt.ggml_tensor->data;
+                model_loader->load_data_for(lt);
+                lt.ggml_tensor->data = lt.data;
+            }
+            else {
+                base_t = dest_t;
+            }
+
             ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
             ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
 
-            if (tensor->ne[0] != loraA->ne[1] || tensor->ne[1] != loraB->ne[1]) {
+            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
                 fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                               " are you sure that this adapter is for this model?\n", __func__, tensor->ne[0], loraA->ne[1]);
+                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
                 return 1;
             }
 
@@ -1910,14 +1955,14 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
                 BA = ggml_scale(lora_ctx, BA, scale_tensor);
             }
 
-            // printf("%s: (B)(%d %d %d %d) x (A)(%d %d %d %d) => (BA)(%d %d %d %d) + (T)(%d %d %d %d)\n",
-            //     base_name.c_str(),
-            //     (int)loraB->ne[0], (int)loraB->ne[1], (int)loraB->ne[2], (int)loraB->ne[3],
-            //     (int)loraA->ne[0], (int)loraA->ne[1], (int)loraA->ne[2], (int)loraA->ne[3],
-            //     (int)BA->ne[0], (int)BA->ne[1], (int)BA->ne[2], (int)BA->ne[3],
-            //     (int)tensor->ne[0], (int)tensor->ne[1], (int)tensor->ne[2], (int)tensor->ne[3]
-            // );
-            ggml_tensor * r = ggml_add_inplace(lora_ctx, tensor, BA);
+            ggml_tensor * r;
+            if (base_t == dest_t) {
+                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+            }
+            else {
+                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_cpy(lora_ctx, r, dest_t);
+            }
 
             struct ggml_cgraph gf = ggml_build_forward(r);
             gf.n_threads = n_threads;
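The add/cpy split above is the substance of the change: with a base model, BA is added to the clean base tensor and the result is then copied into the destination with ggml_cpy, converting to the destination's type along the way; without one, BA is accumulated in place as before. Numerically each patched weight is W' = W + scale * (B * A), where scale comes from the adapter (alpha / r, via scale_tensor above). A tiny self-contained demo of that arithmetic with toy shapes and placeholder values, nothing from the commit:

    #include <cstdio>

    int main() {
        const int n = 2, r = 1;                // 2x2 weight, rank-1 adapter
        float base[2][2] = {{1, 2}, {3, 4}};   // frozen base weight W
        float A[1][2]    = {{0.5f, 0.5f}};     // loraA: r x n
        float B[2][1]    = {{1.0f}, {2.0f}};   // loraB: n x r
        const float scale = 16.0f / 8.0f;      // alpha / r (placeholder values)
        float dest[2][2];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                float ba = 0.0f;
                for (int k = 0; k < r; k++) ba += B[i][k] * A[k][j];
                dest[i][j] = base[i][j] + scale * ba;  // W' = W + scale * (B*A)
            }
        }
        printf("%.1f %.1f\n%.1f %.1f\n", dest[0][0], dest[0][1], dest[1][0], dest[1][1]);
        return 0;
    }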
@@ -1934,14 +1979,27 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
         }
     }
 
+    // TODO: this should be in a destructor, it will leak on failure
     ggml_free(lora_ctx);
+    if (base_ctx) {
+        ggml_free(base_ctx);
+    }
 
     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
     fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
 
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
+    } catch (const std::string & err) {
+        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+        return 1;
+    }
+}
+
 // Returns the KV cache that will contain the context for the
 // ongoing prediction with the model.
 const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
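With the try/catch wrapper in place, callers see the widened public entry point. A hypothetical caller, assuming the llama.h C API of this period (llama_context_default_params, llama_init_from_file, llama_free) and placeholder model paths:

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        // Pass the f16 base model so the quantized weights are not used as the add
        // target; pass NULL instead to patch the loaded weights in place.
        if (llama_apply_lora_from_file(ctx, "lora/ggml-adapter-model.bin",
                                       "models/7B/ggml-model-f16.bin", /*n_threads*/ 4) != 0) {
            fprintf(stderr, "failed to apply lora adapter\n");
            llama_free(ctx);
            return 1;
        }
        llama_free(ctx);
        return 0;
    }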