Create /embedding endpoint in new server
    make -j o//llamafile/server/main
    o//llamafile/server/main -m /weights/all-MiniLM-L6-v2.F32.gguf
    curl http://127.0.0.1:8080/embedding?prompt=orange
jart committed Jun 30, 2024
1 parent 46dda4f commit 1346ef4
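
The curl line above is the whole smoke test. For a programmatic client, here is a minimal C sketch using plain POSIX sockets; the file name embed_client.c is hypothetical, the address and port come from the curl command, and no assumption is made about the reply format (the handler's output is not among the hunks shown below), so it just prints the raw response.

    // embed_client.c -- hedged sketch of the curl request above (POSIX sockets only).
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void) {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd == -1) {
            perror("socket");
            return 1;
        }
        struct sockaddr_in addr = {0};
        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);
        inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
            perror("connect");
            return 2;
        }
        const char req[] = "GET /embedding?prompt=orange HTTP/1.1\r\n"
                           "Host: 127.0.0.1:8080\r\n"
                           "Connection: close\r\n\r\n";
        write(fd, req, sizeof(req) - 1);
        char buf[4096];
        ssize_t n;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, n, stdout); // headers + body, format not assumed here
        close(fd);
        return 0;
    }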
Showing 29 changed files with 907 additions and 97 deletions.
14 changes: 10 additions & 4 deletions llama.cpp/ggml.c
@@ -41,6 +41,7 @@ SOFTWARE.");
#include "llamafile/log.h"
#include "llamafile/debug.h"
#include "llamafile/sgemm.h"
#include "llamafile/thread.h"

#include <alloca.h>
#include <assert.h>
@@ -1574,7 +1575,7 @@ int ggml_delay(int backoff) {
}
backoff++;
} else {
sched_yield();
pthread_yield_np();
}
return backoff;
}
@@ -18561,7 +18562,7 @@ typedef int ggml_lock_t;

typedef pthread_t ggml_thread_t;

#define ggml_thread_create pthread_create
#define ggml_thread_create llamafile_thread_create
#define ggml_thread_join pthread_join

#else
@@ -18975,8 +18976,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
#endif

pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0);

#ifdef LLAMAFILE_SYNC_REPORT
g_sync.stamp = rdtsc();
unsigned long old = 0;
@@ -18986,10 +18985,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
memory_order_relaxed);
#endif

int ct;
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ct);
pthread_testcancel();

while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
state->ec = GGML_STATUS_ABORTED;
pthread_setcanceltype(ct, 0);
return 0;
}

@@ -19126,6 +19130,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}

pthread_setcanceltype(ct,0);

#ifdef LLAMAFILE_SYNC_REPORT
g_sync.work_cycles += rdtsc() - g_sync.stamp;
double total = g_sync.work_cycles + g_sync.wait_cycles;
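
The cancellation changes above follow the standard POSIX discipline: save the current cancel type, switch to asynchronous cancellation, honor any pending request, and restore the old type on every exit path. The following generic worker is an illustration of that pattern only, not llamafile's actual ggml_graph_compute_thread; the names worker and g_done are made up for the sketch.

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int g_done; // never set here: the loop only ends via cancellation

    // Illustration of the save / enable / restore cancellation pattern.
    static void *worker(void *arg) {
        (void)arg;
        int ct;
        pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ct); // save previous type
        pthread_testcancel();          // act on a cancel that arrived before this point
        while (!atomic_load(&g_done)) {
            // compute kernel: may now be cancelled at any instruction
        }
        pthread_setcanceltype(ct, 0);  // restore before leaving the cancellable region
        return 0;
    }

    int main(void) {
        pthread_t tid;
        pthread_create(&tid, 0, worker, 0);
        pthread_cancel(tid);           // async type lets this land mid-computation
        pthread_join(tid, 0);
        return 0;
    }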
2 changes: 1 addition & 1 deletion llama.cpp/ggml.h
@@ -262,7 +262,7 @@
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
exit(1); \
__builtin_trap(); \
} \
} while (0)

25 changes: 19 additions & 6 deletions llama.cpp/llama.cpp
@@ -56,6 +56,7 @@
#include <unistd.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <ctl/vector.h>

#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((__format__(__gnu_printf__, __VA_ARGS__)))

@@ -1629,6 +1630,7 @@ struct llama_cparams {
float defrag_thold;

bool embeddings;
bool embeddings_only;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
@@ -1908,7 +1910,7 @@ struct llama_model {

layer_buft buft_input;
layer_buft buft_output;
std::vector<layer_buft> buft_layer;
ctl::vector<layer_buft> buft_layer;

// contexts where the model tensors metadata is stored
std::vector<struct ggml_context *> ctxs;
@@ -2240,6 +2242,7 @@ static bool llama_kv_cache_init(
}

// allocate tensors and initialize the buffers to avoid NaNs in the padding
// BOOP 2709 us
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
@@ -2248,7 +2251,8 @@
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
if (!cparams.embeddings_only)
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
cache.bufs.push_back(buf);
}
@@ -4611,13 +4615,15 @@ static bool llm_load_tensors(
}
#endif

n_gpu_layers = std::min(n_gpu_layers, (int)hparams.n_layer); // [jart]
n_gpu_layers = std::max(n_gpu_layers, 0); // [jart]

model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;

const int64_t n_layer = hparams.n_layer;
const int n_gpu = std::min(n_gpu_layers, int(n_layer)); // [jart] prevent vector overflow
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu, (int64_t) 0);
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
bool use_mmap_buffer = true;

// there is very little benefit to offloading the input layer, so always keep it on the CPU
@@ -11122,7 +11128,7 @@ static int llama_decode_internal(
const uint32_t n_tokens_all = batch_all.n_tokens;

if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1;
}

@@ -15231,6 +15237,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embeddings =*/ false,
/*.embeddings_only =*/ false, // [jart]
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.abort_callback =*/ nullptr,
@@ -15408,6 +15415,8 @@ struct llama_context * llama_new_context_with_model(
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.pooling_type = params.pooling_type;
cparams.embeddings = params.embeddings;
cparams.embeddings_only = params.embeddings_only; // [jart]

cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -15675,6 +15684,7 @@ if (llamafile_has_metal()) {
pipeline_parallel = false;
}
// #endif
// BOOP 1611 us ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel)
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);

if (pipeline_parallel) {
@@ -15688,6 +15698,7 @@ if (llamafile_has_metal()) {
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

// initialize scheduler with the worst-case graph
// BOOP 298 us ggml_backend_sched_reserve(ctx->sched, gf)
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
llama_free(ctx);
@@ -16112,6 +16123,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
}

void llama_kv_cache_clear(struct llama_context * ctx) {
if (ctx->cparams.embeddings_only) // [jart]
return;
llama_kv_cache_clear(ctx->kv_self);
}

@@ -17316,7 +17329,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {

// [jart] DO NOT SYNC this function
static float * llama_get_embeddings_ith_fail(int i, std::string reason) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason);
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason.c_str());
return nullptr;
}

1 change: 1 addition & 0 deletions llama.cpp/llama.h
@@ -298,6 +298,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits)
bool embeddings_only; // if true, the context is only used to extract embeddings, so KV-cache maintenance can be skipped
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention

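
Caller-side, the new field slots into the usual context setup. A hypothetical sketch (the server code that actually sets it lives in files not shown here; `model` is assumed to be an already-loaded llama_model):

    struct llama_context_params cp = llama_context_default_params();
    cp.embeddings      = true;  // extract embeddings
    cp.embeddings_only = true;  // context is never used for generation, so the
                                // KV-cache clearing skipped in the llama.cpp hunks
                                // above never needs to happen
    struct llama_context *ctx = llama_new_context_with_model(model, cp);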
9 changes: 8 additions & 1 deletion llamafile/BUILD.mk
@@ -20,6 +20,7 @@ LLAMAFILE_OBJS := \
# this executable defines its own malloc(), free(), etc.
# therefore we want to avoid it going inside the .a file
LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/zipalign.o,$(LLAMAFILE_OBJS))
LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/thread_test.o,$(LLAMAFILE_OBJS))

include llamafile/server/BUILD.mk

@@ -49,7 +50,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/zipalign \
o/$(MODE)/llamafile/zipcheck \
o/$(MODE)/llamafile/tokenize \
o/$(MODE)/llamafile/addnl
o/$(MODE)/llamafile/addnl \

################################################################################
# microarchitectures
@@ -116,6 +117,12 @@ o/$(MODE)/llamafile/tinyblas_cpu_mixmul_arm82.o: private TARGET_ARCH += -Xaarch6
################################################################################
# testing

o/$(MODE)/llamafile/thread_test: \
o/$(MODE)/llamafile/thread_test.o \
o/$(MODE)/llamafile/thread.o \
o/$(MODE)/llamafile/crash.o \
o/$(MODE)/llamafile/dll3.o \

o/$(MODE)/llamafile/sgemm_sss_test: private LDFLAGS += -fopenmp
o/$(MODE)/llamafile/sgemm_sss_test.o: private CCFLAGS += -fopenmp
o/$(MODE)/llamafile/sgemm_matmul_test: private LDFLAGS += -fopenmp
48 changes: 26 additions & 22 deletions llamafile/server/crash.cpp → llamafile/crash.c
@@ -1,5 +1,5 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
@@ -15,8 +15,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "signals.h"
#include "utils.h"
#include "crash.h"

#include <cosmo.h>
#include <ucontext.h>
@@ -29,11 +28,18 @@
#define BP gregs[REG_RBP]
#endif

char *hexcpy(char *p, unsigned long x) {
int k = x ? (__builtin_clzl(x) ^ 63) + 1 : 1;
k = (k + 3) & -4;
while (k > 0)
*p++ = "0123456789abcdef"[(x >> (k -= 4)) & 15];
*p = '\0';
return p;
}

// returns true if `p` is preceded by x86 call instruction
// this is actually impossible to do but we'll do our best
int
is_call(const unsigned char* p)
{
int is_call(const unsigned char *p) {
if (p[-5] == 0xe8)
return 5; // call Jvds
if (p[-2] == 0xff && (p[-1] & 070) == 020)
@@ -49,10 +55,10 @@ is_call(const unsigned char* p)
return 0;
}

char*
describe_crash(char* buf, size_t len, int sig, siginfo_t* si, void* arg)
{
char* p = buf;
// abashed the devil stood
// and felt how awful goodness is
char *describe_crash(char *buf, size_t len, int sig, siginfo_t *si, void *arg) {
char *p = buf;

// check minimum length
if (len < 64)
@@ -72,15 +78,15 @@ describe_crash(char* buf, size_t len, int sig, siginfo_t* si, void* arg)
}

// get stack frame daisy chain
StackFrame pc;
StackFrame* sf;
ucontext_t* ctx;
if ((ctx = (ucontext_t*)arg)) {
struct StackFrame pc;
struct StackFrame *sf;
ucontext_t *ctx;
if ((ctx = (ucontext_t *)arg)) {
pc.addr = ctx->uc_mcontext.PC;
pc.next = (struct StackFrame*)ctx->uc_mcontext.BP;
pc.next = (struct StackFrame *)ctx->uc_mcontext.BP;
sf = &pc;
} else {
sf = (struct StackFrame*)__builtin_frame_address(0);
sf = (struct StackFrame *)__builtin_frame_address(0);
}

// describe backtrace
@@ -90,10 +96,8 @@ describe_backtrace(char* p, size_t len, const StackFrame* sf)
return p;
}

char*
describe_backtrace(char* p, size_t len, const StackFrame* sf)
{
char* pe = p + len;
char *describe_backtrace(char *p, size_t len, const struct StackFrame *sf) {
char *pe = p + len;
bool gotsome = false;

// show address of each function
@@ -111,7 +115,7 @@ describe_backtrace(char* p, size_t len, const StackFrame* sf)
break;
}
if (p + 16 + 1 < pe) {
unsigned char* ip = (unsigned char*)sf->addr;
unsigned char *ip = (unsigned char *)sf->addr;
#ifdef __x86_64__
            // x86 advances the program counter before an instruction
// begins executing. return addresses in backtraces shall
16 changes: 16 additions & 0 deletions llamafile/crash.h
@@ -0,0 +1,16 @@
#ifndef LLAMAFILE_CRASH_H_
#define LLAMAFILE_CRASH_H_
#include <signal.h>
#ifdef __cplusplus
extern "C" {
#endif

struct StackFrame;

char *describe_crash(char *, size_t, int, siginfo_t *, void *);
char *describe_backtrace(char *, size_t, const struct StackFrame *);

#ifdef __cplusplus
}
#endif
#endif /* LLAMAFILE_CRASH_H_ */
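
The two declarations above are all a caller needs to turn a fatal signal into a readable report. One plausible hookup follows; the handler name on_crash, the main() driver, and the deliberate null write are illustration only, and nothing is assumed about how the server actually installs its handlers.

    #include <signal.h>
    #include <string.h>
    #include <unistd.h>
    #include "llamafile/crash.h"

    // Hypothetical handler: format a report into a stack buffer with
    // describe_crash() and emit it with write(), which is async-signal-safe.
    static void on_crash(int sig, siginfo_t *si, void *ctx) {
        char buf[512];
        char *end = describe_crash(buf, sizeof(buf), sig, si, ctx);
        write(2, buf, end - buf);
        write(2, "\n", 1);
        _exit(128 + sig);
    }

    int main(void) {
        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_crash;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, 0);
        sigaction(SIGBUS, &sa, 0);
        volatile int *p = 0;
        *p = 1; // deliberate SIGSEGV to exercise the handler
        return 0;
    }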