3RNN/Lib/site-packages/tensorflow/include/xla/ef57.h
2024-05-26 19:49:15 +02:00

67 lines
2.8 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Copyright 2017 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef XLA_EF57_H_
#define XLA_EF57_H_
#include <cmath>
#include <utility>
#include "absl/types/span.h"
namespace xla {
// Utility function to split a double-precision float (F64) into a pair of F32s.
// For a p-bit number, and a splitting point (p/2) <= s <= (p - 1), the
// algorithm produces a (p - s)-bit value 'hi' and a non-overlapping (s - 1)-bit
// value 'lo'. See Theorem 4 in [1] (attributed to Dekker) or [2] for the
// original theorem by Dekker.
//
// For double-precision F64s, which contain a 53 bit mantissa (52 of them
// explicit), we can represent the most significant 49 digits as the unevaluated
// sum of two single-precision floats 'hi' and 'lo'. The 'hi' float stores the
// most significant 24 bits and the sign bit of 'lo' together with its mantissa
// store the remaining 25 bits. The exponent of the resulting representation is
// still restricted to 8 bits of F32.
//
// References:
// [1] A. Thall, Extended-Precision Floating-Point Numbers for GPU Computation,
// SIGGRAPH Research Posters, 2006.
// (http://andrewthall.org/papers/df64_qf128.pdf)
// [2] T. J. Dekker, A floating point technique for extending the available
// precision, Numerische Mathematik, vol. 18, pp. 224242, 1971.
inline std::pair<float, float> SplitF64ToF32(double x) {
const float x_f32 = static_cast<float>(x);
const bool result_is_finite = std::isfinite(x_f32);
// The high float is simply the double rounded to the nearest float. Because
// we are rounding to nearest with ties to even, the error introduced in
// rounding is less than half an ULP in the high ULP.
const float hi = x_f32;
// We can compute the low term using Sterbenz' lemma: If a and b are two
// positive floating point numbers and a/2 ≤ b ≤ 2a, then their difference can
// be computed exactly.
// Note: the difference is computed exactly but is rounded to the nearest
// float which will introduce additional error.
const float lo = static_cast<float>(x - static_cast<double>(hi));
return std::make_pair(hi, result_is_finite ? lo : 0.0f);
}
void ConvertF64ToEf57(absl::Span<const double> input, absl::Span<float> output);
} // namespace xla
#endif // XLA_EF57_H_