Skip to content

Commit

Permalink
Introduce new llamafile server
Browse files Browse the repository at this point in the history
You can now build and run `o//llamafile/server/main` which launches an
HTTP server that currently supports a single endpoint at /tokenize. If
wrk sends it a request to tokenize a string that has 51 tokens then it
serves two million requests per second on my workstation, where 99 pct
latency is 179 µs. This server is designed to be crash-proof, reliable,
and preempting. Workers are able to be asynchronously canceled so the
supervisor thread can respawn them. Cosmo's new memory allocator helps
this server be high-performance for llama.cpp's STL-heavy use case too.
  • Loading branch information
jart committed Jun 5, 2024
1 parent 8b9be96 commit e0656ea
Show file tree
Hide file tree
Showing 39 changed files with 2,902 additions and 121 deletions.
9 changes: 5 additions & 4 deletions build/config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

PREFIX = /usr/local
COSMOCC = .cosmocc/3.3.10
COSMOCC = .cosmocc/3.4.0
TOOLCHAIN = $(COSMOCC)/bin/cosmo

AR = $(TOOLCHAIN)ar
Expand All @@ -13,7 +13,8 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
INSTALL = install

ARFLAGS = rcsD
CCFLAGS = -g -O3 -fexceptions -fsignaling-nans
CXXFLAGS = -frtti -std=gnu++23
CCFLAGS = -g -ggdb -O3 -fexceptions -fsignaling-nans -ffunction-sections -fdata-sections
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes -DLLAMAFILE_DEBUG
TARGET_ARCH = -Xx86_64-mavx -Xx86_64-mtune=znver4

Expand Down Expand Up @@ -51,5 +52,5 @@ clean:; rm -rf o
.PHONY: distclean
distclean:; rm -rf o .cosmocc

.cosmocc/3.3.10:
build/download-cosmocc.sh $@ 3.3.10 00d61c1215667314f66e288c8285bae38cc6137fca083e5bba6c74e3a52439de
.cosmocc/3.4.0:
build/download-cosmocc.sh $@ 3.4.0 475e24b84a18973312433f5280e267acbe1b4dac1b2e2ebb3cfce46051a8c08c
10 changes: 8 additions & 2 deletions llamafile/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,17 @@ LLAMAFILE_SRCS_CPP = $(filter %.cpp,$(LLAMAFILE_FILES))
LLAMAFILE_SRCS = $(LLAMAFILE_SRCS_C) $(LLAMAFILE_SRCS_CPP) $(LLAMAFILE_SRCS_CU)
LLAMAFILE_DOCS = $(filter %.1,$(LLAMAFILE_FILES))

LLAMAFILE_OBJS = \
LLAMAFILE_OBJS := \
$(LLAMAFILE_SRCS_C:%.c=o/$(MODE)/%.o) \
$(LLAMAFILE_SRCS_CPP:%.cpp=o/$(MODE)/%.o) \
$(LLAMAFILE_FILES:%=o/$(MODE)/%.zip.o)

# this executable defines its own malloc(), free(), etc.
# therefore we want to avoid it going inside the .a file
LLAMAFILE_OBJS := $(filter-out o/$(MODE)/llamafile/zipalign.o,$(LLAMAFILE_OBJS))

include llamafile/server/BUILD.mk

o/$(MODE)/llamafile/zipalign: \
o/$(MODE)/llamafile/zipalign.o \
o/$(MODE)/llamafile/help.o \
Expand All @@ -29,7 +35,6 @@ o/$(MODE)/llamafile/zipcheck: \

o/$(MODE)/llamafile/simple: \
o/$(MODE)/llamafile/simple.o \
o/$(MODE)/llama.cpp/llava/llava.a \
o/$(MODE)/llama.cpp/llama.cpp.a

o/$(MODE)/llamafile/tokenize: \
Expand All @@ -39,6 +44,7 @@ o/$(MODE)/llamafile/tokenize: \
.PHONY: o/$(MODE)/llamafile
o/$(MODE)/llamafile: \
$(LLAMAFILE_OBJS) \
o/$(MODE)/llamafile/server \
o/$(MODE)/llamafile/simple \
o/$(MODE)/llamafile/zipalign \
o/$(MODE)/llamafile/zipcheck \
Expand Down
24 changes: 11 additions & 13 deletions llamafile/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
#include "debug.h"
#include "log.h"

#include <atomic>
#include <cosmo.h>
#include <fenv.h>
#include <libc/calls/struct/aarch64.internal.h>
#include <libc/calls/struct/ucontext.internal.h>
#include <signal.h>
#include <stdatomic.h>
#include <termios.h>
#include <ucontext.h>
#include <unistd.h>
Expand All @@ -36,7 +36,7 @@
#define UNDERFLOW_DELAY 2

bool FLAG_trap;
static atomic_llong g_underflowed;
static std::atomic_llong g_underflowed;
static thread_local int g_enabled;
thread_local int llamafile_debug_op_index;
const struct ggml_cgraph *llamafile_debug_graph;
Expand All @@ -59,17 +59,17 @@ static long long millis(void) {
return timespec_tomillis(timespec_real());
}

static inline void spinlock(atomic_uint *lock) {
static inline void spinlock(std::atomic_uint *lock) {
int x;
for (;;) {
x = atomic_exchange_explicit(lock, 1, memory_order_acquire);
x = lock->exchange(1, std::memory_order_acquire);
if (!x)
break;
}
}

static inline void spunlock(atomic_uint *lock) {
atomic_store_explicit(lock, 0, memory_order_release);
static inline void spunlock(std::atomic_uint *lock) {
lock->store(0, std::memory_order_release);
}

static const char *describe_vertex(struct ggml_tensor *t) {
Expand Down Expand Up @@ -130,16 +130,15 @@ static void on_sigfpe(int sig, siginfo_t *si, void *arg) {
if (reason == FPE_FLTUND) {
if (g_terminal_buddy.is_terminal) {
long long now = millis();
if ((now - atomic_exchange_explicit(&g_underflowed, now, memory_order_relaxed)) >
UNDERFLOW_DELAY) {
if ((now - g_underflowed.exchange(now, std::memory_order_relaxed)) > UNDERFLOW_DELAY) {
write(2, UNDERFLOW_ALARM, strlen(UNDERFLOW_ALARM));
}
}
recover(ctx, FE_UNDERFLOW);
return;
}

static atomic_uint lock;
static std::atomic_uint lock;
spinlock(&lock);

const char *issue;
Expand Down Expand Up @@ -205,7 +204,7 @@ static void setup_sigfpe(void) {
}

int llamafile_trapping_enabled(int delta) {
static atomic_uint once;
static _Atomic(uint32_t) once;
bool was_enabled = g_enabled > 0;
bool is_enabled = (g_enabled += delta) > 0;
feclearexcept(FE_ALL_EXCEPT);
Expand All @@ -225,11 +224,10 @@ void llamafile_trapping_restore(void) {
feenableexcept(TRAPS);
long long last;
if (g_terminal_buddy.is_terminal &&
(last = atomic_load_explicit(&g_underflowed, memory_order_relaxed))) {
(last = g_underflowed.load(std::memory_order_relaxed))) {
long long now = millis();
if (now - last > UNDERFLOW_DELAY &&
now - atomic_exchange_explicit(&g_underflowed, 0, memory_order_relaxed) >
UNDERFLOW_DELAY) {
now - g_underflowed.exchange(0, std::memory_order_relaxed) > UNDERFLOW_DELAY) {
write(2, UNDERFLOW_RESET, strlen(UNDERFLOW_RESET));
}
}
Expand Down
Loading

0 comments on commit e0656ea

Please sign in to comment.