Create /embedding endpoint in new server
    make -j o//llamafile/server/main
    o//llamafile/server/main -m /weights/all-MiniLM-L6-v2.F32.gguf
    curl http://127.0.0.1:8080/embedding?prompt=orange
jart committed Jun 30, 2024
1 parent 46dda4f commit 1346ef4
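
The curl line above is the whole smoke test. For a programmatic client, here is a minimal C sketch using plain POSIX sockets; the file name embed_client.c is hypothetical, the address and port come from the curl command, and no assumption is made about the reply format (the handler's output is not among the hunks shown below), so it just prints the raw response.

    // embed_client.c -- hedged sketch of the curl request above (POSIX sockets only).
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void) {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd == -1) {
            perror("socket");
            return 1;
        }
        struct sockaddr_in addr = {0};
        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);
        inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
            perror("connect");
            return 2;
        }
        const char req[] = "GET /embedding?prompt=orange HTTP/1.1\r\n"
                           "Host: 127.0.0.1:8080\r\n"
                           "Connection: close\r\n\r\n";
        write(fd, req, sizeof(req) - 1);
        char buf[4096];
        ssize_t n;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            fwrite(buf, 1, n, stdout); // headers + body, format not assumed here
        close(fd);
        return 0;
    }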
Showing 29 changed files with 907 additions and 97 deletions.
14 changes: 10 additions & 4 deletions llama.cpp/ggml.c
@@ -41,6 +41,7 @@ SOFTWARE.");
#include "llamafile/log.h"
#include "llamafile/debug.h"
#include "llamafile/sgemm.h"
#include "llamafile/thread.h"

#include <alloca.h>
#include <assert.h>
@@ -1574,7 +1575,7 @@ int ggml_delay(int backoff) {
}
backoff++;
} else {
sched_yield();
pthread_yield_np();
}
return backoff;
}
@@ -18561,7 +18562,7 @@ typedef int ggml_lock_t;

typedef pthread_t ggml_thread_t;

#define ggml_thread_create pthread_create
#define ggml_thread_create llamafile_thread_create
#define ggml_thread_join pthread_join

#else
@@ -18975,8 +18976,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
#endif

pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, 0);

#ifdef LLAMAFILE_SYNC_REPORT
g_sync.stamp = rdtsc();
unsigned long old = 0;
@@ -18986,10 +18985,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
memory_order_relaxed);
#endif

int ct;
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ct);
pthread_testcancel();

while (true) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
state->ec = GGML_STATUS_ABORTED;
pthread_setcanceltype(ct, 0);
return 0;
}

@@ -19126,6 +19130,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}

pthread_setcanceltype(ct,0);

#ifdef LLAMAFILE_SYNC_REPORT
g_sync.work_cycles += rdtsc() - g_sync.stamp;
double total = g_sync.work_cycles + g_sync.wait_cycles;
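
The cancellation changes above follow the standard POSIX discipline: save the current cancel type, switch to asynchronous cancellation, honor any pending request, and restore the old type on every exit path. The following generic worker is an illustration of that pattern only, not llamafile's actual ggml_graph_compute_thread; the names worker and g_done are made up for the sketch.

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int g_done; // never set here: the loop only ends via cancellation

    // Illustration of the save / enable / restore cancellation pattern.
    static void *worker(void *arg) {
        (void)arg;
        int ct;
        pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &ct); // save previous type
        pthread_testcancel();          // act on a cancel that arrived before this point
        while (!atomic_load(&g_done)) {
            // compute kernel: may now be cancelled at any instruction
        }
        pthread_setcanceltype(ct, 0);  // restore before leaving the cancellable region
        return 0;
    }

    int main(void) {
        pthread_t tid;
        pthread_create(&tid, 0, worker, 0);
        pthread_cancel(tid);           // async type lets this land mid-computation
        pthread_join(tid, 0);
        return 0;
    }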
2 changes: 1 addition & 1 deletion llama.cpp/ggml.h
@@ -262,7 +262,7 @@
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
exit(1); \
__builtin_trap(); \
} \
} while (0)

25 changes: 19 additions & 6 deletions llama.cpp/llama.cpp
@@ -56,6 +56,7 @@
#include <unistd.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <ctl/vector.h>

#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((__format__(__gnu_printf__, __VA_ARGS__)))

@@ -1629,6 +1630,7 @@ struct llama_cparams {
float defrag_thold;

bool embeddings;
bool embeddings_only;
bool causal_attn;
bool offload_kqv;
bool flash_attn;
@@ -1908,7 +1910,7 @@ struct llama_model {

layer_buft buft_input;
layer_buft buft_output;
std::vector<layer_buft> buft_layer;
ctl::vector<layer_buft> buft_layer;

// contexts where the model tensors metadata is stored
std::vector<struct ggml_context *> ctxs;
@@ -2240,6 +2242,7 @@ static bool llama_kv_cache_init(
}

// allocate tensors and initialize the buffers to avoid NaNs in the padding
// BOOP 2709 us
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
@@ -2248,7 +2251,8 @@
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
if (!cparams.embeddings_only)
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
cache.bufs.push_back(buf);
}
@@ -4611,13 +4615,15 @@ static bool llm_load_tensors(
}
#endif

n_gpu_layers = std::min(n_gpu_layers, (int)hparams.n_layer); // [jart]
n_gpu_layers = std::max(n_gpu_layers, 0); // [jart]

model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;

const int64_t n_layer = hparams.n_layer;
const int n_gpu = std::min(n_gpu_layers, int(n_layer)); // [jart] prevent vector overflow
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu, (int64_t) 0);
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
bool use_mmap_buffer = true;

// there is very little benefit to offloading the input layer, so always keep it on the CPU
@@ -11122,7 +11128,7 @@ static int llama_decode_internal(
const uint32_t n_tokens_all = batch_all.n_tokens;

if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1;
}

@@ -15231,6 +15237,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embeddings =*/ false,
/*.embeddings_only =*/ false, // [jart]
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.abort_callback =*/ nullptr,
@@ -15408,6 +15415,8 @@ struct llama_context * llama_new_context_with_model(
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.pooling_type = params.pooling_type;
cparams.embeddings = params.embeddings;
cparams.embeddings_only = params.embeddings_only; // [jart]

cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -15675,6 +15684,7 @@ if (llamafile_has_metal()) {
pipeline_parallel = false;
}
// #endif
// BOOP 1611 us ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel)
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);

if (pipeline_parallel) {
@@ -15688,6 +15698,7 @@ if (llamafile_has_metal()) {
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

// initialize scheduler with the worst-case graph
// BOOP 298 us ggml_backend_sched_reserve(ctx->sched, gf)
if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
llama_free(ctx);
@@ -16112,6 +16123,8 @@ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
}

void llama_kv_cache_clear(struct llama_context * ctx) {
if (ctx->cparams.embeddings_only) // [jart]
return;
llama_kv_cache_clear(ctx->kv_self);
}

@@ -17316,7 +17329,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {

// [jart] DO NOT SYNC this function
static float * llama_get_embeddings_ith_fail(int i, std::string reason) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason);
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason.c_str());
return nullptr;
}

1 change: 1 addition & 0 deletions llama.cpp/llama.h
@@ -298,6 +298,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits)
bool embeddings_only; // if true, the context is only used to extract embeddings, so KV-cache maintenance can be skipped
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention

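
Caller-side, the new field slots into the usual context setup. A hypothetical sketch (the server code that actually sets it lives in files not shown here; `model` is assumed to be an already-loaded llama_model):

    struct llama_context_params cp = llama_context_default_params();
    cp.embeddings      = true;  // extract embeddings
    cp.embeddings_only = true;  // context is never used for generation, so the
                                // KV-cache clearing skipped in the llama.cpp hunks
                                // above never needs to happen
    struct llama_context *ctx = llama_new_context_with_model(model, cp);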
9 changes: 8 additions & 1 deletion llamafile/BUILD.mk
@@ -20,6 +20,7 @@ LLAMAFILE_OBJS := \
# this executable defines its own malloc(), free(), etc.
# therefore we want to avoid it going inside the .a file
LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/zipalign.o,$(LLAMAFILE_OBJS))
LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/thread_test.o,$(LLAMAFILE_OBJS))

include llamafile/server/BUILD.mk

@@ -49,7 +50,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/zipalign \
o/$(MODE)/llamafile/zipcheck \
o/$(MODE)/llamafile/tokenize \
o/$(MODE)/llamafile/addnl
o/$(MODE)/llamafile/addnl \

################################################################################
# microarchitectures
@@ -116,6 +117,12 @@ o/$(MODE)/llamafile/tinyblas_cpu_mixmul_arm82.o: private TARGET_ARCH += -Xaarch6
################################################################################
# testing

o/$(MODE)/llamafile/thread_test: \
o/$(MODE)/llamafile/thread_test.o \
o/$(MODE)/llamafile/thread.o \
o/$(MODE)/llamafile/crash.o \
o/$(MODE)/llamafile/dll3.o \

o/$(MODE)/llamafile/sgemm_sss_test: private LDFLAGS += -fopenmp
o/$(MODE)/llamafile/sgemm_sss_test.o: private CCFLAGS += -fopenmp
o/$(MODE)/llamafile/sgemm_matmul_test: private LDFLAGS += -fopenmp
48 changes: 26 additions & 22 deletions llamafile/server/crash.cpp → llamafile/crash.c
@@ -1,5 +1,5 @@
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
//
// Copyright 2024 Mozilla Foundation
//
@@ -15,8 +15,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "signals.h"
#include "utils.h"
#include "crash.h"

#include <cosmo.h>
#include <ucontext.h>
@@ -29,11 +28,18 @@
#define BP gregs[REG_RBP]
#endif

char *hexcpy(char *p, unsigned long x) {
int k = x ? (__builtin_clzl(x) ^ 63) + 1 : 1;
k = (k + 3) & -4;
while (k > 0)
*p++ = "0123456789abcdef"[(x >> (k -= 4)) & 15];
*p = '\0';
return p;
}

// returns true if `p` is preceded by x86 call instruction
// this is actually impossible to do but we'll do our best
int
is_call(const unsigned char* p)
{
int is_call(const unsigned char *p) {
if (p[-5] == 0xe8)
return 5; // call Jvds
if (p[-2] == 0xff && (p[-1] & 070) == 020)
@@ -49,10 +55,10 @@ is_call(const unsigned char* p)
return 0;
}

char*
describe_crash(char* buf, size_t len, int sig, siginfo_t* si, void* arg)
{
char* p = buf;
// abashed the devil stood
// and felt how awful goodness is
char *describe_crash(char *buf, size_t len, int sig, siginfo_t *si, void *arg) {
char *p = buf;

// check minimum length
if (len < 64)
@@ -72,15 +78,15 @@ describe_crash(char* buf, size_t len, int sig, siginfo_t* si, void* arg)
}

// get stack frame daisy chain
StackFrame pc;
StackFrame* sf;
ucontext_t* ctx;
if ((ctx = (ucontext_t*)arg)) {
struct StackFrame pc;
struct StackFrame *sf;
ucontext_t *ctx;
if ((ctx = (ucontext_t *)arg)) {
pc.addr = ctx->uc_mcontext.PC;
pc.next = (struct StackFrame*)ctx->uc_mcontext.BP;
pc.next = (struct StackFrame *)ctx->uc_mcontext.BP;
sf = &pc;
} else {
sf = (struct StackFrame*)__builtin_frame_address(0);
sf = (struct StackFrame *)__builtin_frame_address(0);
}

// describe backtrace
@@ -90,10 +96,8 @@ describe_backtrace(char* p, size_t len, const StackFrame* sf)
return p;
}

char*
describe_backtrace(char* p, size_t len, const StackFrame* sf)
{
char* pe = p + len;
char *describe_backtrace(char *p, size_t len, const struct StackFrame *sf) {
char *pe = p + len;
bool gotsome = false;

// show address of each function
@@ -111,7 +115,7 @@ describe_backtrace(char* p, size_t len, const StackFrame* sf)
break;
}
if (p + 16 + 1 < pe) {
unsigned char* ip = (unsigned char*)sf->addr;
unsigned char *ip = (unsigned char *)sf->addr;
#ifdef __x86_64__
            // x86 advances the program counter before an instruction
// begins executing. return addresses in backtraces shall
16 changes: 16 additions & 0 deletions llamafile/crash.h
@@ -0,0 +1,16 @@
#ifndef LLAMAFILE_CRASH_H_
#define LLAMAFILE_CRASH_H_
#include <signal.h>
#ifdef __cplusplus
extern "C" {
#endif

struct StackFrame;

char *describe_crash(char *, size_t, int, siginfo_t *, void *);
char *describe_backtrace(char *, size_t, const struct StackFrame *);

#ifdef __cplusplus
}
#endif
#endif /* LLAMAFILE_CRASH_H_ */
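
The two declarations above are all a caller needs to turn a fatal signal into a readable report. One plausible hookup follows; the handler name on_crash, the main() driver, and the deliberate null write are illustration only, and nothing is assumed about how the server actually installs its handlers.

    #include <signal.h>
    #include <string.h>
    #include <unistd.h>
    #include "llamafile/crash.h"

    // Hypothetical handler: format a report into a stack buffer with
    // describe_crash() and emit it with write(), which is async-signal-safe.
    static void on_crash(int sig, siginfo_t *si, void *ctx) {
        char buf[512];
        char *end = describe_crash(buf, sizeof(buf), sig, si, ctx);
        write(2, buf, end - buf);
        write(2, "\n", 1);
        _exit(128 + sig);
    }

    int main(void) {
        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_crash;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, 0);
        sigaction(SIGBUS, &sa, 0);
        volatile int *p = 0;
        *p = 1; // deliberate SIGSEGV to exercise the handler
        return 0;
    }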