Skip to content

Commit

Permalink
Make gemma2-27b-it the same as aistudio.google.com
Browse files Browse the repository at this point in the history
llamafile now runs inference on the Google Gemma2 model in a way that
outputs responses identical to those of the aistudio.google.com service.
Thus we can now say for certain that we are faithfully reproducing the
model as Google intended.
  • Loading branch information
jart committed Jul 1, 2024
1 parent 41678c8 commit af22695
Show file tree
Hide file tree
Showing 14 changed files with 35 additions and 314 deletions.
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx2
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx2
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx2
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx2
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx2
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx2
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx2
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx512bf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512bf16
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512bf16
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512bf16
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512bf16
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512bf16
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512bf16
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512bf16
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-f16c.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_f16c
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_f16c
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_f16c
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_f16c
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_f16c
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_f16c
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_f16c
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-fma.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_fma
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_fma
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_fma
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_fma
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_fma
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_fma
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_fma
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-arm80.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_arm80
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_arm80
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_arm80
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_arm80
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_arm80
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_arm80
#define ggml_vec_silu_f32 ggml_vec_silu_f32_arm80
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-arm82.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_arm82
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_arm82
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_arm82
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_arm82
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_arm82
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_arm82
#define ggml_vec_silu_f32 ggml_vec_silu_f32_arm82
Expand Down
22 changes: 0 additions & 22 deletions llama.cpp/ggml-vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,15 +372,6 @@ extern "C" void ggml_vec_hardsigmoid_f32_amd_avx (const int n, float * y, const
extern "C" void ggml_vec_hardsigmoid_f32_arm82 (const int n, float * y, const float * x);
extern "C" void ggml_vec_hardsigmoid_f32_arm80 (const int n, float * y, const float * x);

extern "C" void ggml_vec_gelu_f16_amd_avx512bf16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx512(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx2(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_f16c(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_fma(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_arm82(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_arm80(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);

extern "C" void ggml_vec_gelu_f32_amd_avx512bf16(const int n, float * y, const float * x);
extern "C" void ggml_vec_gelu_f32_amd_avx512(const int n, float * y, const float * x);
extern "C" void ggml_vec_gelu_f32_amd_avx2(const int n, float * y, const float * x);
Expand Down Expand Up @@ -549,7 +540,6 @@ static const struct VectorFuncs {
typeof(ggml_vec_leaky_relu_f32) *ptr_ggml_vec_leaky_relu_f32;
typeof(ggml_vec_hardswish_f32) *ptr_ggml_vec_hardswish_f32;
typeof(ggml_vec_hardsigmoid_f32) *ptr_ggml_vec_hardsigmoid_f32;
typeof(ggml_vec_gelu_f16) *ptr_ggml_vec_gelu_f16;
typeof(ggml_vec_gelu_f32) *ptr_ggml_vec_gelu_f32;
typeof(ggml_vec_gelu_quick_f32) *ptr_ggml_vec_gelu_quick_f32;
typeof(ggml_vec_silu_f32) *ptr_ggml_vec_silu_f32;
Expand Down Expand Up @@ -609,7 +599,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx512bf16;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx512bf16;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx512bf16;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx512bf16;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx512bf16;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx512bf16;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx512bf16;
Expand Down Expand Up @@ -670,7 +659,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx512;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx512;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx512;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx512;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx512;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx512;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx512;
Expand Down Expand Up @@ -731,7 +719,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx2;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx2;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx2;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx2;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx2;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx2;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx2;
Expand Down Expand Up @@ -792,7 +779,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_f16c;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_f16c;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_f16c;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_f16c;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_f16c;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_f16c;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_f16c;
Expand Down Expand Up @@ -853,7 +839,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_fma;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_fma;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_fma;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_fma;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_fma;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_fma;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_fma;
Expand Down Expand Up @@ -914,7 +899,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx;
Expand Down Expand Up @@ -975,7 +959,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_arm82;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_arm82;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_arm82;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_arm82;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_arm82;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_arm82;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_arm82;
Expand Down Expand Up @@ -1036,7 +1019,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_arm80;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_arm80;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_arm80;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_arm80;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_arm80;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_arm80;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_arm80;
Expand Down Expand Up @@ -1221,10 +1203,6 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) {
return funcs.ptr_ggml_vec_hardsigmoid_f32(n, y, x);
}

void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
return funcs.ptr_ggml_vec_gelu_f16(n, y, x);
}

void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
return funcs.ptr_ggml_vec_gelu_f32(n, y, x);
}
Expand Down
3 changes: 0 additions & 3 deletions llama.cpp/ggml-vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ extern "C" {

typedef double ggml_float;

extern float *ggml_table_gelu_f16;

void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n);
Expand Down Expand Up @@ -49,7 +47,6 @@ void ggml_vec_relu_f32 (const int n, float * y, const float * x);
void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns);
void ggml_vec_hardswish_f32 (const int n, float * y, const float * x);
void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x);
void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
void ggml_vec_gelu_f32(const int n, float * y, const float * x);
void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x);
void ggml_vec_silu_f32(const int n, float * y, const float * x);
Expand Down
65 changes: 20 additions & 45 deletions llama.cpp/ggml-vector.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1329,31 +1329,11 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
// -7 -6 -5 -4 -3 -2 -1 0 1
//

inline static float ggml_gelu_f32(float x) {
// Scalar GeLU activation: .5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// the standard tanh-based approximation. M_2_PI is the math.h constant
// 2/pi, so sqrtf(M_2_PI) yields the required sqrt(2/pi) factor.
static inline float ggml_gelu_f32(float x) {
    // GeLU approximation that goes slower and we seem to be stuck with.
    return .5f * x * (1.f + tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
}

// Builds the 65536-entry fp16 -> GeLU(f32) lookup table used by
// ggml_vec_gelu_f16, covering every possible 16-bit input bit pattern.
// Invoked through ggml_once by the caller, so it runs at most once;
// the table is not freed here (process-lifetime cache).
static void ggml_vec_gelu_f16_init(void) {
    // NOTE(review): malloc result is unchecked; a failed allocation would
    // fault on the first table store below.
    ggml_table_gelu_f16 = malloc(sizeof(float) * 65536);
    for (int i = 0; i < 65536; ++i) {
        // Reinterpret the loop index's 16 bits as an fp16 value via union
        // punning. Assumes ggml_fp16_t is a native half-float type so that
        // reading u.f yields the half whose bit pattern is i — TODO confirm
        // against the project's ggml_fp16_t definition.
        union {
            unsigned short i;
            ggml_fp16_t f;
        } u = {i};
        // Precompute GeLU in f32 for this fp16 input.
        ggml_table_gelu_f16[i] = ggml_gelu_f32(u.f);
    }
}

// Computes y[i] = GeLU(x[i]) for n fp16 elements by indexing a lazily
// built lookup table with each input's raw 16-bit representation.
void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    // Lazily build the 65536-entry table on first use (ggml_once —
    // presumably a thread-safe one-time initializer; confirm).
    static atomic_uint once;
    ggml_once(&once, ggml_vec_gelu_f16_init);
    // View the fp16 inputs as raw 16-bit table indices.
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        // Table entries are float; the assignment narrows f32 -> fp16.
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_f32(x[i]);
Expand Down Expand Up @@ -1424,32 +1404,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
const __m512 r = _mm512_set1_ps(0x1.8p23f);
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
const __m512 n = _mm512_sub_ps(z, r);
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)), u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
if (_mm512_kortestz(c, c))
return _mm512_fmadd_ps(j, k, k);
const __m512i g = _mm512_and_si512(
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
_mm512_set1_epi32(0x82000000u));
const __m512 s1 =
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
const __m512 b =
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
const __mmask16 d =
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
return _mm512_mask_blend_ps(
d, _mm512_mask_blend_ps(
c, _mm512_fmadd_ps(k, j, k),
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
_mm512_mul_ps(s1, s1));
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(
_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)),
u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
const __m512 res = _mm512_scalef_ps(j, n);
if (_mm512_kortestz(d, d))
return res;
const __m512 zero = _mm512_setzero_ps();
const __m512 alt = _mm512_mask_blend_ps(
_mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
Expand Down
Loading

0 comments on commit af22695

Please sign in to comment.