Skip to content

Commit

Permalink
Remove ggml_context cache
Browse files · Browse the repository at this point in the history
This appears to be old code intended to elide malloc() calls. Using it
caused reliability issues for the new embedding server, due to the cache's
hard limit of 64 allocations, which were protected by a global lock (GIL).
  • Loading branch information
jart committed Jul 1, 2024
1 parent 3af1ac0 commit 617d841
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 115 deletions.
177 changes: 63 additions & 114 deletions llama.cpp/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1616,12 +1616,6 @@ struct ggml_context {
struct ggml_scratch scratch_save;
};

struct ggml_context_container {
bool used;

struct ggml_context context;
};

struct ggml_compute_state_shared {
const struct ggml_cgraph* cgraph;
const struct ggml_cplan* cplan;
Expand Down Expand Up @@ -1925,7 +1919,6 @@ struct ggml_numa_nodes {
//

struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};

Expand Down Expand Up @@ -2360,148 +2353,104 @@ static inline int ggml_up(int n, int m) {

////////////////////////////////////////////////////////////////////////////////

struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
static void ggml_init_once(void) {
llamafile_trapping_enabled(-1);

static bool is_first_call = true;

if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();

// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

for (int i = 0; i < (1 << 16); ++i) {
union {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
}
// initialize time system (required on Windows)
ggml_time_init();

const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
for (int i = 0; i < (1 << 16); ++i) {
union {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
}

// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
g_state.contexts[i].used = false;
}

const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif
g_state = (struct ggml_state) {
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};

ggml_setup_op_has_task_pass();
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

is_first_call = false;
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

// find non-used context in g_state
struct ggml_context * ctx = NULL;
#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif

for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
ctx = &g_state.contexts[i].context;
ggml_setup_op_has_task_pass();

GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
break;
}
}
llamafile_trapping_enabled(+1);
}

if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
struct ggml_context * ggml_init(struct ggml_init_params params) {

llamafile_trapping_enabled(+1);
ggml_critical_section_end();
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, ggml_init_once);

struct ggml_context * ctx;
if (!(ctx = calloc(1, sizeof(struct ggml_context)))) {
GGML_PRINT("%s: failed to allocate ggml_context\n", __func__);
return NULL;
}

// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
if (params.mem_buffer) {
ctx->mem_size = params.mem_size;
ctx->mem_buffer = params.mem_buffer;
} else {
if (params.mem_size) {
ctx->mem_size = GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
} else {
ctx->mem_size = GGML_MEM_ALIGN;
}
ctx->mem_buffer_owned = true;
ctx->mem_buffer = GGML_ALIGNED_MALLOC(ctx->mem_size);
if (!ctx->mem_buffer) {
GGML_PRINT("%s: failed to allocate %zu bytes for ggml_context->mem_buffer\n",
__func__, ctx->mem_size);
free(ctx);
return NULL;
}
}

const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
};

GGML_ASSERT(ctx->mem_buffer != NULL);
ctx->no_alloc = params.no_alloc;
ctx->no_alloc_save = params.no_alloc;

ggml_assert_aligned(ctx->mem_buffer);

GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

ggml_critical_section_end();

return ctx;
}

void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) {
if (ctx == NULL)
return;
}

// make this function thread safe
ggml_critical_section_start();

bool found = false;

for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;

GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));

if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
}

found = true;
break;
}
}

if (!found) {
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
}

ggml_critical_section_end();
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned)
GGML_ALIGNED_FREE(ctx->mem_buffer);
free(ctx);
}

size_t ggml_used_mem(const struct ggml_context * ctx) {
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,6 @@

#define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 128 // [jart] for stable diffusion
Expand Down

0 comments on commit 617d841

Please sign in to comment.