Skip to content

Commit

Permalink
Remove ggml_context cache
Browse files · Browse the repository at this point in the history
This appears to be old code intended to elide malloc() calls. Using it
caused reliability issues for the new embedding server, due to the cache's
hard limit of 64 allocations, which were protected by a global lock (GIL).
  • Loading branch information
jart committed Jul 1, 2024
1 parent 3af1ac0 commit 617d841
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 115 deletions.
177 changes: 63 additions & 114 deletions llama.cpp/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1616,12 +1616,6 @@ struct ggml_context {
struct ggml_scratch scratch_save;
};

struct ggml_context_container {
bool used;

struct ggml_context context;
};

struct ggml_compute_state_shared {
const struct ggml_cgraph* cgraph;
const struct ggml_cplan* cplan;
Expand Down Expand Up @@ -1925,7 +1919,6 @@ struct ggml_numa_nodes {
//

struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};

Expand Down Expand Up @@ -2360,148 +2353,104 @@ static inline int ggml_up(int n, int m) {

////////////////////////////////////////////////////////////////////////////////

struct ggml_context * ggml_init(struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
static void ggml_init_once(void) {
llamafile_trapping_enabled(-1);

static bool is_first_call = true;

if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();

// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

for (int i = 0; i < (1 << 16); ++i) {
union {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
}
// initialize time system (required on Windows)
ggml_time_init();

const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
// initialize GELU, Quick GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
for (int i = 0; i < (1 << 16); ++i) {
union {
uint16_t u16;
ggml_fp16_t fp16;
} u = {i};
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
}

// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
g_state.contexts[i].used = false;
}

const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
// initialize g_state
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif
g_state = (struct ggml_state) {
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};

ggml_setup_op_has_task_pass();
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

is_first_call = false;
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}

// find non-used context in g_state
struct ggml_context * ctx = NULL;
#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif

for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
ctx = &g_state.contexts[i].context;
ggml_setup_op_has_task_pass();

GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
break;
}
}
llamafile_trapping_enabled(+1);
}

if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
struct ggml_context * ggml_init(struct ggml_init_params params) {

llamafile_trapping_enabled(+1);
ggml_critical_section_end();
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, ggml_init_once);

struct ggml_context * ctx;
if (!(ctx = calloc(1, sizeof(struct ggml_context)))) {
GGML_PRINT("%s: failed to allocate ggml_context\n", __func__);
return NULL;
}

// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
if (params.mem_buffer) {
ctx->mem_size = params.mem_size;
ctx->mem_buffer = params.mem_buffer;
} else {
if (params.mem_size) {
ctx->mem_size = GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
} else {
ctx->mem_size = GGML_MEM_ALIGN;
}
ctx->mem_buffer_owned = true;
ctx->mem_buffer = GGML_ALIGNED_MALLOC(ctx->mem_size);
if (!ctx->mem_buffer) {
GGML_PRINT("%s: failed to allocate %zu bytes for ggml_context->mem_buffer\n",
__func__, ctx->mem_size);
free(ctx);
return NULL;
}
}

const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
};

GGML_ASSERT(ctx->mem_buffer != NULL);
ctx->no_alloc = params.no_alloc;
ctx->no_alloc_save = params.no_alloc;

ggml_assert_aligned(ctx->mem_buffer);

GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

ggml_critical_section_end();

return ctx;
}

void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) {
if (ctx == NULL)
return;
}

// make this function thread safe
ggml_critical_section_start();

bool found = false;

for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;

GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));

if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
}

found = true;
break;
}
}

if (!found) {
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
}

ggml_critical_section_end();
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned)
GGML_ALIGNED_FREE(ctx->mem_buffer);
free(ctx);
}

size_t ggml_used_mem(const struct ggml_context * ctx) {
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,6 @@

#define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 128 // [jart] for stable diffusion
Expand Down

0 comments on commit 617d841

Please sign in to comment.