Skip to content

Commit

Permalink
Make gemma2-27b-it the same as aistudio.google.com
Browse files Browse the repository at this point in the history
llamafile now runs inference on the Google Gemma2 model in a way that
outputs responses identical to those of the aistudio.google.com service.
Thus we can now say for certain that we are faithfully reproducing the
model as Google intended.
  • Loading branch information
jart committed Jul 1, 2024
1 parent 41678c8 commit af22695
Show file tree
Hide file tree
Showing 14 changed files with 35 additions and 314 deletions.
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx2
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx2
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx2
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx2
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx2
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx2
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx2
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-avx512bf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512bf16
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512bf16
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512bf16
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512bf16
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512bf16
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512bf16
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512bf16
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-f16c.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_f16c
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_f16c
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_f16c
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_f16c
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_f16c
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_f16c
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_f16c
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-amd-fma.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_fma
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_fma
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_fma
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_fma
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_fma
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_fma
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_fma
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-arm80.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_arm80
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_arm80
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_arm80
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_arm80
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_arm80
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_arm80
#define ggml_vec_silu_f32 ggml_vec_silu_f32_arm80
Expand Down
1 change: 0 additions & 1 deletion llama.cpp/ggml-vector-arm82.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_arm82
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_arm82
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_arm82
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_arm82
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_arm82
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_arm82
#define ggml_vec_silu_f32 ggml_vec_silu_f32_arm82
Expand Down
22 changes: 0 additions & 22 deletions llama.cpp/ggml-vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,15 +372,6 @@ extern "C" void ggml_vec_hardsigmoid_f32_amd_avx (const int n, float * y, const
extern "C" void ggml_vec_hardsigmoid_f32_arm82 (const int n, float * y, const float * x);
extern "C" void ggml_vec_hardsigmoid_f32_arm80 (const int n, float * y, const float * x);

extern "C" void ggml_vec_gelu_f16_amd_avx512bf16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx512(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx2(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_f16c(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_fma(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_amd_avx(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_arm82(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
extern "C" void ggml_vec_gelu_f16_arm80(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);

extern "C" void ggml_vec_gelu_f32_amd_avx512bf16(const int n, float * y, const float * x);
extern "C" void ggml_vec_gelu_f32_amd_avx512(const int n, float * y, const float * x);
extern "C" void ggml_vec_gelu_f32_amd_avx2(const int n, float * y, const float * x);
Expand Down Expand Up @@ -549,7 +540,6 @@ static const struct VectorFuncs {
typeof(ggml_vec_leaky_relu_f32) *ptr_ggml_vec_leaky_relu_f32;
typeof(ggml_vec_hardswish_f32) *ptr_ggml_vec_hardswish_f32;
typeof(ggml_vec_hardsigmoid_f32) *ptr_ggml_vec_hardsigmoid_f32;
typeof(ggml_vec_gelu_f16) *ptr_ggml_vec_gelu_f16;
typeof(ggml_vec_gelu_f32) *ptr_ggml_vec_gelu_f32;
typeof(ggml_vec_gelu_quick_f32) *ptr_ggml_vec_gelu_quick_f32;
typeof(ggml_vec_silu_f32) *ptr_ggml_vec_silu_f32;
Expand Down Expand Up @@ -609,7 +599,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx512bf16;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx512bf16;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx512bf16;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx512bf16;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx512bf16;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx512bf16;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx512bf16;
Expand Down Expand Up @@ -670,7 +659,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx512;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx512;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx512;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx512;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx512;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx512;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx512;
Expand Down Expand Up @@ -731,7 +719,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx2;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx2;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx2;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx2;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx2;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx2;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx2;
Expand Down Expand Up @@ -792,7 +779,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_f16c;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_f16c;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_f16c;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_f16c;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_f16c;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_f16c;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_f16c;
Expand Down Expand Up @@ -853,7 +839,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_fma;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_fma;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_fma;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_fma;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_fma;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_fma;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_fma;
Expand Down Expand Up @@ -914,7 +899,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_amd_avx;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_amd_avx;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_amd_avx;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_amd_avx;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_amd_avx;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_amd_avx;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_amd_avx;
Expand Down Expand Up @@ -975,7 +959,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_arm82;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_arm82;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_arm82;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_arm82;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_arm82;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_arm82;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_arm82;
Expand Down Expand Up @@ -1036,7 +1019,6 @@ static const struct VectorFuncs {
ptr_ggml_vec_leaky_relu_f32 = ggml_vec_leaky_relu_f32_arm80;
ptr_ggml_vec_hardswish_f32 = ggml_vec_hardswish_f32_arm80;
ptr_ggml_vec_hardsigmoid_f32 = ggml_vec_hardsigmoid_f32_arm80;
ptr_ggml_vec_gelu_f16 = ggml_vec_gelu_f16_arm80;
ptr_ggml_vec_gelu_f32 = ggml_vec_gelu_f32_arm80;
ptr_ggml_vec_gelu_quick_f32 = ggml_vec_gelu_quick_f32_arm80;
ptr_ggml_vec_silu_f32 = ggml_vec_silu_f32_arm80;
Expand Down Expand Up @@ -1221,10 +1203,6 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) {
return funcs.ptr_ggml_vec_hardsigmoid_f32(n, y, x);
}

void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
return funcs.ptr_ggml_vec_gelu_f16(n, y, x);
}

void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
return funcs.ptr_ggml_vec_gelu_f32(n, y, x);
}
Expand Down
3 changes: 0 additions & 3 deletions llama.cpp/ggml-vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ extern "C" {

typedef double ggml_float;

extern float *ggml_table_gelu_f16;

void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n);
Expand Down Expand Up @@ -49,7 +47,6 @@ void ggml_vec_relu_f32 (const int n, float * y, const float * x);
void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns);
void ggml_vec_hardswish_f32 (const int n, float * y, const float * x);
void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x);
void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x);
void ggml_vec_gelu_f32(const int n, float * y, const float * x);
void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x);
void ggml_vec_silu_f32(const int n, float * y, const float * x);
Expand Down
65 changes: 20 additions & 45 deletions llama.cpp/ggml-vector.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1329,31 +1329,11 @@ void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (i
// -7 -6 -5 -4 -3 -2 -1 0 1
//

inline static float ggml_gelu_f32(float x) {
// Scalar GeLU activation: .5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))),
// the standard tanh-based approximation. M_2_PI is the math.h constant
// 2/pi, so sqrtf(M_2_PI) yields the required sqrt(2/pi) factor.
static inline float ggml_gelu_f32(float x) {
    // GeLU approximation that goes slower and we seem to be stuck with.
    return .5f * x * (1.f + tanhf(sqrtf(M_2_PI) * (x + .044715f * x * x * x)));
}

// Builds the 65536-entry fp16 -> GeLU(f32) lookup table used by
// ggml_vec_gelu_f16, covering every possible 16-bit input bit pattern.
// Invoked through ggml_once by the caller, so it runs at most once;
// the table is not freed here (process-lifetime cache).
static void ggml_vec_gelu_f16_init(void) {
    // NOTE(review): malloc result is unchecked; a failed allocation would
    // fault on the first table store below.
    ggml_table_gelu_f16 = malloc(sizeof(float) * 65536);
    for (int i = 0; i < 65536; ++i) {
        // Reinterpret the loop index's 16 bits as an fp16 value via union
        // punning. Assumes ggml_fp16_t is a native half-float type so that
        // reading u.f yields the half whose bit pattern is i — TODO confirm
        // against the project's ggml_fp16_t definition.
        union {
            unsigned short i;
            ggml_fp16_t f;
        } u = {i};
        // Precompute GeLU in f32 for this fp16 input.
        ggml_table_gelu_f16[i] = ggml_gelu_f32(u.f);
    }
}

// Computes y[i] = GeLU(x[i]) for n fp16 elements by indexing a lazily
// built lookup table with each input's raw 16-bit representation.
void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    // Lazily build the 65536-entry table on first use (ggml_once —
    // presumably a thread-safe one-time initializer; confirm).
    static atomic_uint once;
    ggml_once(&once, ggml_vec_gelu_f16_init);
    // View the fp16 inputs as raw 16-bit table indices.
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        // Table entries are float; the assignment narrows f32 -> fp16.
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
for (int i = 0; i < n; ++i) {
y[i] = ggml_gelu_f32(x[i]);
Expand Down Expand Up @@ -1424,32 +1404,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
const __m512 r = _mm512_set1_ps(0x1.8p23f);
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
const __m512 n = _mm512_sub_ps(z, r);
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)), u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
if (_mm512_kortestz(c, c))
return _mm512_fmadd_ps(j, k, k);
const __m512i g = _mm512_and_si512(
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
_mm512_set1_epi32(0x82000000u));
const __m512 s1 =
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
const __m512 b =
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
const __mmask16 d =
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
return _mm512_mask_blend_ps(
d, _mm512_mask_blend_ps(
c, _mm512_fmadd_ps(k, j, k),
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
_mm512_mul_ps(s1, s1));
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(
_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)),
u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
const __m512 res = _mm512_scalef_ps(j, n);
if (_mm512_kortestz(d, d))
return res;
const __m512 zero = _mm512_setzero_ps();
const __m512 alt = _mm512_mask_blend_ps(
_mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
Expand Down
Loading

0 comments on commit af22695

Please sign in to comment.