Skip to content

Commit

Permalink
Merge pull request #2265 from johnplatts:hwy_ceil_floor_int_062824
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 648309403
  • Loading branch information
Copybara-Service committed Jul 1, 2024
2 parents 4120d6f + 02031df commit ebab10b
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 8 deletions.
10 changes: 10 additions & 0 deletions g3doc/quick_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1710,6 +1710,16 @@ All functions except `Stream` are defined in cache_control.h.
<code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
results are undefined for NaN.

* `V`: `f`; `Ret`: `Vec<RebindToSigned<DFromV<V>>>` \
<code>Ret **CeilInt**(V a)</code>: equivalent to
`ConvertTo(RebindToSigned<DFromV<V>>(), Ceil(a))`, but `CeilInt(a)` is more
efficient on some targets, including SSE2, SSSE3, and AArch64 NEON.

* `V`: `f`; `Ret`: `Vec<RebindToSigned<DFromV<V>>>` \
<code>Ret **FloorInt**(V a)</code>: equivalent to
`ConvertTo(RebindToSigned<DFromV<V>>(), Floor(a))`, but `FloorInt(a)` is
more efficient on some targets, including SSE2, SSSE3, and AArch64 NEON.

* `D`: `i32`, `V`: `f64` \
<code>Vec&lt;D&gt; **DemoteToNearestInt**(D d, V v)</code>: converts `v[i]`
to `TFromD<D>`, rounding to nearest (with ties to even).
Expand Down
83 changes: 83 additions & 0 deletions hwy/ops/arm_neon-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -5187,6 +5187,89 @@ HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {

#endif

// ------------------------------ CeilInt/FloorInt
#if HWY_ARCH_ARM_A64

#ifdef HWY_NATIVE_CEIL_FLOOR_INT
#undef HWY_NATIVE_CEIL_FLOOR_INT
#else
#define HWY_NATIVE_CEIL_FLOOR_INT
#endif

#if HWY_HAVE_FLOAT16
// float16 -> int16 for a full vector, rounding toward +infinity
// (vcvtpq_s16_f16).
HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {
  const auto raw_i16 = vcvtpq_s16_f16(v.raw);
  return Vec128<int16_t>(raw_i16);
}

// float16 -> int16 for vectors of at most 8 bytes, rounding toward +infinity
// (vcvtp_s16_f16).
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
  const auto raw_i16 = vcvtp_s16_f16(v.raw);
  return Vec128<int16_t, N>(raw_i16);
}

// float16 -> int16 for a full vector, rounding toward -infinity
// (vcvtmq_s16_f16).
HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {
  const auto raw_i16 = vcvtmq_s16_f16(v.raw);
  return Vec128<int16_t>(raw_i16);
}

// float16 -> int16 for vectors of at most 8 bytes, rounding toward -infinity
// (vcvtm_s16_f16).
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
  const auto raw_i16 = vcvtm_s16_f16(v.raw);
  return Vec128<int16_t, N>(raw_i16);
}
#endif // HWY_HAVE_FLOAT16

// float -> int32 for a full vector, rounding toward +infinity
// (vcvtpq_s32_f32).
HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {
  const auto raw_i32 = vcvtpq_s32_f32(v.raw);
  return Vec128<int32_t>(raw_i32);
}

// float -> int32 for vectors of at most 8 bytes, rounding toward +infinity
// (vcvtp_s32_f32).
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
  const auto raw_i32 = vcvtp_s32_f32(v.raw);
  return Vec128<int32_t, N>(raw_i32);
}

// double -> int64 for a full vector, rounding toward +infinity
// (vcvtpq_s64_f64).
HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {
  const auto raw_i64 = vcvtpq_s64_f64(v.raw);
  return Vec128<int64_t>(raw_i64);
}

// double -> int64 for vectors of at most 8 bytes, rounding toward +infinity.
template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtp_s64_f64 intrinsic: place v in both halves of
  // a vector twice as wide, convert that, then keep only the lower half.
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;
  const Twice<decltype(df)> df_wide;
  const auto widened = Combine(df_wide, v, v);
  const auto ceil_wide = CeilInt(widened);
  return LowerHalf(di, ceil_wide);
#else
  const auto raw_i64 = vcvtp_s64_f64(v.raw);
  return Vec128<int64_t, N>(raw_i64);
#endif
}

// float -> int32 for a full vector, rounding toward -infinity
// (vcvtmq_s32_f32).
HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {
  const auto raw_i32 = vcvtmq_s32_f32(v.raw);
  return Vec128<int32_t>(raw_i32);
}

// float -> int32 for vectors of at most 8 bytes, rounding toward -infinity
// (vcvtm_s32_f32).
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
  const auto raw_i32 = vcvtm_s32_f32(v.raw);
  return Vec128<int32_t, N>(raw_i32);
}

// double -> int64 for a full vector, rounding toward -infinity
// (vcvtmq_s64_f64).
HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {
  const auto raw_i64 = vcvtmq_s64_f64(v.raw);
  return Vec128<int64_t>(raw_i64);
}

// double -> int64 for vectors of at most 8 bytes, rounding toward -infinity.
template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtm_s64_f64 intrinsic: place v in both halves of
  // a vector twice as wide, convert that, then keep only the lower half.
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;
  const Twice<decltype(df)> df_wide;
  const auto widened = Combine(df_wide, v, v);
  const auto floor_wide = FloorInt(widened);
  return LowerHalf(di, floor_wide);
#else
  const auto raw_i64 = vcvtm_s64_f64(v.raw);
  return Vec128<int64_t, N>(raw_i64);
#endif
}

#endif // HWY_ARCH_ARM_A64

// ------------------------------ NearestInt (Round)

#if HWY_HAVE_FLOAT16
Expand Down
24 changes: 24 additions & 0 deletions hwy/ops/generic_ops-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,30 @@ HWY_API MFromD<D> IsFinite(const V v) {

#endif // HWY_NATIVE_ISINF

// ------------------------------ CeilInt/FloorInt
#if (defined(HWY_NATIVE_CEIL_FLOOR_INT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_CEIL_FLOOR_INT
#undef HWY_NATIVE_CEIL_FLOOR_INT
#else
#define HWY_NATIVE_CEIL_FLOOR_INT
#endif

// Generic fallback: applies Ceil to the floating-point vector, then converts
// the result to the signed integer vector type given by RebindToSigned.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
  const RebindToSigned<DFromV<V>> di;
  const auto rounded_up = Ceil(v);
  return ConvertTo(di, rounded_up);
}

// Generic fallback: applies Floor to the floating-point vector, then converts
// the result to the signed integer vector type given by RebindToSigned.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
  const RebindToSigned<DFromV<V>> di;
  const auto rounded_down = Floor(v);
  return ConvertTo(di, rounded_down);
}

#endif // HWY_NATIVE_CEIL_FLOOR_INT

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
Expand Down
32 changes: 32 additions & 0 deletions hwy/ops/x86_128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -11949,6 +11949,25 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
return IfThenElse(detail::UseInt(v), int_f - neg1, v);
}

#ifdef HWY_NATIVE_CEIL_FLOOR_INT
#undef HWY_NATIVE_CEIL_FLOOR_INT
#else
#define HWY_NATIVE_CEIL_FLOOR_INT
#endif

// Computes the ceiling of v directly as signed integers: truncate, then bump
// the lanes where truncation landed below the input.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> CeilInt(V v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto truncated = ConvertTo(di, v);  // round toward 0
  const auto truncated_f = ConvertTo(df, truncated);

  // Truncating a positive non-integer ends up smaller than the input. Only
  // lanes for which detail::UseInt(v) also holds are adjusted.
  const auto needs_increment = And(detail::UseInt(v), truncated_f < v);

  // Subtracting the mask-as-vector adds 1 in the selected lanes.
  return truncated - VecFromMask(di, RebindMask(di, needs_increment));
}

// Toward -infinity, aka floor
template <typename T, size_t N>
HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
Expand All @@ -11965,6 +11984,19 @@ HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
return IfThenElse(detail::UseInt(v), int_f + neg1, v);
}

// Computes the floor of v directly as signed integers: truncate, then lower
// the lanes where truncation landed above the input.
template <class V, HWY_IF_FLOAT_V(V)>
HWY_API VFromD<RebindToSigned<DFromV<V>>> FloorInt(V v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto truncated = ConvertTo(di, v);  // round toward 0
  const auto truncated_f = ConvertTo(df, truncated);

  // Truncating a negative non-integer ends up larger than the input. Only
  // lanes for which detail::UseInt(v) also holds are adjusted.
  const auto needs_decrement = And(detail::UseInt(v), truncated_f > v);

  // Adding the mask-as-vector subtracts 1 in the selected lanes.
  return truncated + VecFromMask(di, RebindMask(di, needs_decrement));
}

#else

// Toward nearest integer, ties to even
Expand Down
57 changes: 49 additions & 8 deletions hwy/tests/float_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -395,18 +395,38 @@ HWY_NOINLINE void TestAllTrunc() {
// Checks Ceil and CeilInt against std::ceil for every input produced by
// RoundTestCases.
struct TestCeil {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    const RebindToSigned<decltype(d)> di;
    using TI = MakeSigned<T>;

    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);
    auto expected_int = AllocateAligned<TI>(padded);
    HWY_ASSERT(expected && expected_int);

    // Magnitudes at or above this value are treated as out of range for TI
    // and are expected to saturate.
    constexpr double kMinOutOfRangeVal = -static_cast<double>(LimitsMin<TI>());
    static_assert(kMinOutOfRangeVal > 0.0,
                  "kMinOutOfRangeVal > 0.0 must be true");

    for (size_t i = 0; i < padded; ++i) {
      // Cast to double because ceil does not support _Float16.
      const double ceil_val = std::ceil(ConvertScalarTo<double>(in[i]));
      expected[i] = ConvertScalarTo<T>(ceil_val);
      if (ScalarIsNaN(ceil_val)) {
        // NaN lanes of the CeilInt result are zeroed before comparison below.
        expected_int[i] = 0;
      } else if (ScalarIsInf(ceil_val) || static_cast<double>(ScalarAbs(
                                              ceil_val)) >= kMinOutOfRangeVal) {
        // Infinite or out-of-range: expect saturation according to the sign.
        expected_int[i] =
            ScalarSignBit(ceil_val) ? LimitsMin<TI>() : LimitsMax<TI>();
      } else {
        expected_int[i] = ConvertScalarTo<TI>(ceil_val);
      }
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      const auto v = Load(d, &in[i]);
      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(v));
      // Mask out NaN lanes so they compare equal to the expected 0.
      HWY_ASSERT_VEC_EQ(di, &expected_int[i],
                        IfThenZeroElse(RebindMask(di, IsNaN(v)), CeilInt(v)));
    }
  }
};
Expand All @@ -418,18 +438,39 @@ HWY_NOINLINE void TestAllCeil() {
// Checks Floor and FloorInt against std::floor for every input produced by
// RoundTestCases.
struct TestFloor {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    const RebindToSigned<decltype(d)> di;
    using TI = MakeSigned<T>;

    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);
    auto expected_int = AllocateAligned<TI>(padded);
    HWY_ASSERT(expected && expected_int);

    // Magnitudes at or above this value are treated as out of range for TI
    // and are expected to saturate.
    constexpr double kMinOutOfRangeVal = -static_cast<double>(LimitsMin<TI>());
    static_assert(kMinOutOfRangeVal > 0.0,
                  "kMinOutOfRangeVal > 0.0 must be true");

    for (size_t i = 0; i < padded; ++i) {
      // Cast to double because floor does not support _Float16.
      const double floor_val = std::floor(ConvertScalarTo<double>(in[i]));
      expected[i] = ConvertScalarTo<T>(floor_val);
      if (ScalarIsNaN(floor_val)) {
        // NaN lanes of the FloorInt result are zeroed before comparison below.
        expected_int[i] = 0;
      } else if (ScalarIsInf(floor_val) ||
                 static_cast<double>(ScalarAbs(floor_val)) >=
                     kMinOutOfRangeVal) {
        // Infinite or out-of-range: expect saturation according to the sign.
        expected_int[i] =
            ScalarSignBit(floor_val) ? LimitsMin<TI>() : LimitsMax<TI>();
      } else {
        expected_int[i] = ConvertScalarTo<TI>(floor_val);
      }
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      const auto v = Load(d, &in[i]);
      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(v));
      // Mask out NaN lanes so they compare equal to the expected 0.
      HWY_ASSERT_VEC_EQ(di, &expected_int[i],
                        IfThenZeroElse(RebindMask(di, IsNaN(v)), FloorInt(v)));
    }
  }
};
Expand Down

0 comments on commit ebab10b

Please sign in to comment.