Merge pull request #2265 from johnplatts:hwy_ceil_floor_int_062824

PiperOrigin-RevId: 648309403
google · Jul 1, 2024 · ebab10b · ebab10b
2 parents 4120d6f + 02031df
commit ebab10b
Show file tree

Hide file tree

Showing 5 changed files with 198 additions and 8 deletions.
diff --git a/g3doc/quick_reference.md b/g3doc/quick_reference.md
@@ -1710,6 +1710,16 @@ All functions except `Stream` are defined in cache_control.h.
     <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
     results are undefined for NaN.
 
+*   `V`: `f`; `Ret`: `Vec<RebindToSigned<DFromV<V>>>` \
+    <code>Ret **CeilInt**(V a)</code>: equivalent to
+    `ConvertTo(RebindToSigned<DFromV<V>>(), Ceil(a))`, but `CeilInt(a)` is more
+    efficient on some targets, including SSE2, SSSE3, and AArch64 NEON.
+
+*   `V`: `f`; `Ret`: `Vec<RebindToSigned<DFromV<V>>>` \
+    <code>Ret **FloorInt**(V a)</code>: equivalent to
+    `ConvertTo(RebindToSigned<DFromV<V>>(), Floor(a))`, but `FloorInt(a)` is
+    more efficient on some targets, including SSE2, SSSE3, and AArch64 NEON.
+
 *   `D`: `i32`, `V`: `f64`
     <code>Vec&lt;D&gt; **DemoteToNearestInt**(D d, V v)</code>: converts `v[i]`
     to `TFromD<D>`, rounding to nearest (with ties to even).

diff --git a/hwy/ops/arm_neon-inl.h b/hwy/ops/arm_neon-inl.h
@@ -5187,6 +5187,89 @@ HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
 
 #endif
 
+// ------------------------------ CeilInt/FloorInt
+#if HWY_ARCH_ARM_A64
+
+#ifdef HWY_NATIVE_CEIL_FLOOR_INT  // Toggle the macro's defined-state; generic_ops-inl.h
+#undef HWY_NATIVE_CEIL_FLOOR_INT  // compares it with HWY_TARGET_TOGGLE to gate its fallback.
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+#if HWY_HAVE_FLOAT16
+HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {  // full-width f16 overload
+  return Vec128<int16_t>(vcvtpq_s16_f16(v.raw));  // f16 -> i16, rounding toward +inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>  // partial vectors (size <= 8)
+HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
+  return Vec128<int16_t, N>(vcvtp_s16_f16(v.raw));  // non-q form; rounds toward +inf
+}
+
+HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {  // full-width f16 overload
+  return Vec128<int16_t>(vcvtmq_s16_f16(v.raw));  // f16 -> i16, rounding toward -inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>  // partial vectors (size <= 8)
+HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
+  return Vec128<int16_t, N>(vcvtm_s16_f16(v.raw));  // non-q form; rounds toward -inf
+}
+#endif  // HWY_HAVE_FLOAT16
+
+HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {  // full-width f32 overload
+  return Vec128<int32_t>(vcvtpq_s32_f32(v.raw));  // f32 -> i32, rounding toward +inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>  // partial vectors (size <= 8)
+HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
+  return Vec128<int32_t, N>(vcvtp_s32_f32(v.raw));  // non-q form; rounds toward +inf
+}
+
+HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {  // full-width f64 overload
+  return Vec128<int64_t>(vcvtpq_s64_f64(v.raw));  // f64 -> i64, rounding toward +inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>  // partial f64 vectors (size <= 8)
+HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610  // presumably GCC older than 6.1
+  // Workaround for missing vcvtp_s64_f64 intrinsic
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  const Twice<decltype(d)> dt;  // tag of the doubled-width vector that Combine produces
+  return LowerHalf(di, CeilInt(Combine(dt, v, v)));  // duplicate v, convert at double width, keep lower half
+#else
+  return Vec128<int64_t, N>(vcvtp_s64_f64(v.raw));  // non-q form; rounds toward +inf
+#endif
+}
+
+HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {  // full-width f32 overload
+  return Vec128<int32_t>(vcvtmq_s32_f32(v.raw));  // f32 -> i32, rounding toward -inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>  // partial vectors (size <= 8)
+HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
+  return Vec128<int32_t, N>(vcvtm_s32_f32(v.raw));  // non-q form; rounds toward -inf
+}
+
+HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {  // full-width f64 overload
+  return Vec128<int64_t>(vcvtmq_s64_f64(v.raw));  // f64 -> i64, rounding toward -inf
+}
+
+template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>  // partial f64 vectors (size <= 8)
+HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610  // presumably GCC older than 6.1
+  // Workaround for missing vcvtm_s64_f64 intrinsic
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;
+  const Twice<decltype(d)> dt;  // tag of the doubled-width vector that Combine produces
+  return LowerHalf(di, FloorInt(Combine(dt, v, v)));  // duplicate v, convert at double width, keep lower half
+#else
+  return Vec128<int64_t, N>(vcvtm_s64_f64(v.raw));  // non-q form; rounds toward -inf
+#endif
+}
+
+#endif  // HWY_ARCH_ARM_A64
+
 // ------------------------------ NearestInt (Round)
 
 #if HWY_HAVE_FLOAT16

diff --git a/hwy/ops/generic_ops-inl.h b/hwy/ops/generic_ops-inl.h
@@ -946,6 +946,30 @@ HWY_API MFromD<D> IsFinite(const V v) {
 
 #endif  // HWY_NATIVE_ISINF
 
+// ------------------------------ CeilInt/FloorInt
+#if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_CEIL_FLOOR_INT  // Toggle the macro's defined-state; the enclosing #if
+#undef HWY_NATIVE_CEIL_FLOOR_INT  // compares that state with HWY_TARGET_TOGGLE.
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>  // generic fallback, restricted to floating-point vectors
+HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;  // signed-integer counterpart of d; also the result tag
+  return ConvertTo(di, Ceil(v));  // round up in floating point, then convert to integer
+}
+
+template <class V, HWY_IF_FLOAT_V(V)>  // generic fallback, restricted to floating-point vectors
+HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
+  const DFromV<decltype(v)> d;
+  const RebindToSigned<decltype(d)> di;  // signed-integer counterpart of d; also the result tag
+  return ConvertTo(di, Floor(v));  // round down in floating point, then convert to integer
+}
+
+#endif  // HWY_NATIVE_CEIL_FLOOR_INT
+
 // ------------------------------ LoadInterleaved2
 
 #if HWY_IDE || \

diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h
@@ -11949,6 +11949,25 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
   return IfThenElse(detail::UseInt(v), int_f - neg1, v);
 }
 
+#ifdef HWY_NATIVE_CEIL_FLOOR_INT  // Toggle the macro's defined-state; generic_ops-inl.h
+#undef HWY_NATIVE_CEIL_FLOOR_INT  // compares it with HWY_TARGET_TOGGLE to gate its fallback.
+#else
+#define HWY_NATIVE_CEIL_FLOOR_INT
+#endif
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {  // ceil computed from the truncating conversion
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);  // truncated value back as float, for comparison with v
+
+  // Truncating a positive non-integer ends up smaller; if so, add 1.
+  return integer -  // true mask lanes act as -1 here, so the subtraction adds 1
+         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f < v)));  // only where UseInt(v) holds
+}
+
 // Toward -infinity, aka floor
 template <typename T, size_t N>
 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
@@ -11965,6 +11984,19 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
   return IfThenElse(detail::UseInt(v), int_f + neg1, v);
 }
 
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {  // floor computed from the truncating conversion
+  const DFromV<decltype(v)> df;
+  const RebindToSigned<decltype(df)> di;
+
+  const auto integer = ConvertTo(di, v);  // round toward 0
+  const auto int_f = ConvertTo(df, integer);  // truncated value back as float, for comparison with v
+
+  // Truncating a negative non-integer ends up larger; if so, subtract 1.
+  return integer +  // true mask lanes act as -1 here, so the addition subtracts 1
+         VecFromMask(di, RebindMask(di, And(detail::UseInt(v), int_f > v)));  // only where UseInt(v) holds
+}
+
 #else
 
 // Toward nearest integer, ties to even

diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc
@@ -395,18 +395,38 @@ HWY_NOINLINE void TestAllTrunc() {
 struct TestCeil {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T t, D d) {
+    const RebindToSigned<decltype(d)> di;  // tag used to compare CeilInt's integer result
+    using TI = MakeSigned<T>;  // lane type of expected_int
+
     size_t padded;
     auto in = RoundTestCases(t, d, padded);
     auto expected = AllocateAligned<T>(padded);
-    HWY_ASSERT(expected);
+    auto expected_int = AllocateAligned<TI>(padded);  // expected CeilInt output per input
+    HWY_ASSERT(expected && expected_int);
+
+    constexpr double kMinOutOfRangeVal = -static_cast<double>(LimitsMin<TI>());
+    static_assert(kMinOutOfRangeVal > 0.0,  // magnitudes >= this bound take the saturating branch below
+                  "kMinOutOfRangeVal > 0.0 must be true");
 
     for (size_t i = 0; i < padded; ++i) {
       // Cast to double because ceil does not support _Float16.
-      expected[i] =
-          ConvertScalarTo<T>(std::ceil(ConvertScalarTo<double>(in[i])));
+      const double ceil_val = std::ceil(ConvertScalarTo<double>(in[i]));
+      expected[i] = ConvertScalarTo<T>(ceil_val);
+      if (ScalarIsNaN(ceil_val)) {
+        expected_int[i] = 0;  // pairs with the NaN lanes that the check below forces to zero
+      } else if (ScalarIsInf(ceil_val) || static_cast<double>(ScalarAbs(
+                                              ceil_val)) >= kMinOutOfRangeVal) {
+        expected_int[i] =
+            ScalarSignBit(ceil_val) ? LimitsMin<TI>() : LimitsMax<TI>();  // expect LimitsMin/LimitsMax by sign
+      } else {
+        expected_int[i] = ConvertScalarTo<TI>(ceil_val);  // in range: plain scalar conversion
+      }
     }
     for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
+      const auto v = Load(d, &in[i]);
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(v));
+      HWY_ASSERT_VEC_EQ(di, &expected_int[i],  // NaN lanes are zeroed, so CeilInt's NaN output is not checked
+                        IfThenZeroElse(RebindMask(di, IsNaN(v)), CeilInt(v)));
     }
   }
 };
@@ -418,18 +438,39 @@ HWY_NOINLINE void TestAllCeil() {
 struct TestFloor {
   template <typename T, class D>
   HWY_NOINLINE void operator()(T t, D d) {
+    const RebindToSigned<decltype(d)> di;  // tag used to compare FloorInt's integer result
+    using TI = MakeSigned<T>;  // lane type of expected_int
+
     size_t padded;
     auto in = RoundTestCases(t, d, padded);
     auto expected = AllocateAligned<T>(padded);
-    HWY_ASSERT(expected);
+    auto expected_int = AllocateAligned<TI>(padded);  // expected FloorInt output per input
+    HWY_ASSERT(expected && expected_int);
+
+    constexpr double kMinOutOfRangeVal = -static_cast<double>(LimitsMin<TI>());
+    static_assert(kMinOutOfRangeVal > 0.0,  // magnitudes >= this bound take the saturating branch below
+                  "kMinOutOfRangeVal > 0.0 must be true");
 
     for (size_t i = 0; i < padded; ++i) {
       // Cast to double because floor does not support _Float16.
-      expected[i] =
-          ConvertScalarTo<T>(std::floor(ConvertScalarTo<double>(in[i])));
+      const double floor_val = std::floor(ConvertScalarTo<double>(in[i]));
+      expected[i] = ConvertScalarTo<T>(floor_val);
+      if (ScalarIsNaN(floor_val)) {
+        expected_int[i] = 0;  // pairs with the NaN lanes that the check below forces to zero
+      } else if (ScalarIsInf(floor_val) ||
+                 static_cast<double>(ScalarAbs(floor_val)) >=
+                     kMinOutOfRangeVal) {
+        expected_int[i] =
+            ScalarSignBit(floor_val) ? LimitsMin<TI>() : LimitsMax<TI>();  // expect LimitsMin/LimitsMax by sign
+      } else {
+        expected_int[i] = ConvertScalarTo<TI>(floor_val);  // in range: plain scalar conversion
+      }
     }
     for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
+      const auto v = Load(d, &in[i]);
+      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(v));
+      HWY_ASSERT_VEC_EQ(di, &expected_int[i],  // NaN lanes are zeroed, so FloorInt's NaN output is not checked
+                        IfThenZeroElse(RebindMask(di, IsNaN(v)), FloorInt(v)));
     }
   }
 };