Reintroduce fast math functions (#7495)

* Add fast fma functions
* Use fast fma functions
* Add fast pow function
* Use fast pow function
* Fix build
* Remove fastFma
* Avoid UB in fastPow

On GCC with -O1 or -O2 optimizations, this new implementation generates identical assembly to the old union-based implementation.
2024-10-01 14:35:15 -04:00
parent 860749a8a1
commit 121d608c3a
4 changed files with 30 additions and 15 deletions
--- a/include/interpolation.h
+++ b/include/interpolation.h
@@ -69,13 +69,13 @@ inline float hermiteInterpolate( float x0, float x1, float x2, float x3,

 inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 {
-	float frsq = x*x;
-	float frcu = frsq*v0;
-	float t1 = std::fma(v1, 3, v3);
+	float frsq = x * x;
+	float frcu = frsq * v0;
+	float t1 = v1 * 3.f + v3;

-	return (v1 + std::fma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) -
-		std::fma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
-		(1.0f / 6.0f) - 0.5f * v2) + frsq * std::fma(0.5f, v2, -v1));
+	return v1 + (0.5f * frcu + x) * (v2 - frcu * (1.0f / 6.0f) -
+		(t1 * (1.0f / 6.0f) - v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
+		(1.0f / 6.0f) - 0.5f * v2) + frsq * (0.5f * v2 - v1);
 }


@@ -83,13 +83,13 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 inline float cosinusInterpolate( float v0, float v1, float x )
 {
 	const float f = ( 1.0f - cosf( x * F_PI ) ) * 0.5f;
-	return std::fma(f, v1 - v0, v0);
+	return f * (v1 - v0) + v0;
 }


 inline float linearInterpolate( float v0, float v1, float x )
 {
-	return std::fma(x, v1 - v0, v0);
+	return x * (v1 - v0) + v0;
 }


@@ -104,7 +104,7 @@ inline float optimalInterpolate( float v0, float v1, float x )
 	const float c2 = even * -0.004541102062639801;
 	const float c3 = odd * -1.57015627178718420;
 	
-	return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0);
+	return ((c3 * z + c2) * z + c1) * z + c0;
 }


@@ -121,7 +121,7 @@ inline float optimal4pInterpolate( float v0, float v1, float v2, float v3, float
 	const float c2 = even1 * -0.246185007019907091 + even2 * 0.24614027139700284;
 	const float c3 = odd1 * -0.36030925263849456 + odd2 * 0.10174985775982505;

-	return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0);
+	return ((c3 * z + c2) * z + c1) * z + c0;
 }


@@ -132,7 +132,7 @@ inline float lagrangeInterpolate( float v0, float v1, float v2, float v3, float
 	const float c1 = v2 - v0 * ( 1.0f / 3.0f ) - v1 * 0.5f - v3 * ( 1.0f / 6.0f );
 	const float c2 = 0.5f * (v0 + v2) - v1;
 	const float c3 = ( 1.0f/6.0f ) * ( v3 - v0 ) + 0.5f * ( v1 - v2 );
-	return std::fma(std::fma(std::fma(c3, x, c2), x, c1), x, c0);
+	return ((c3 * x + c2) * x + c1) * x + c0;
 }


--- a/include/lmms_math.h
+++ b/include/lmms_math.h
@@ -27,12 +27,13 @@

 #include <QtGlobal>
 #include <algorithm>
+#include <cassert>
 #include <cmath>
 #include <cstdint>
+#include <cstring>

 #include "lmms_constants.h"
 #include "lmmsconfig.h"
-#include <cassert>

 namespace lmms
 {
@@ -96,6 +97,20 @@ static void roundAt(T& value, const T& where, const T& stepSize)
 	}
 }

+//! Approximates pow(a, b) by rescaling part of the bit pattern of `a` instead of calling std::pow. Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
+inline double fastPow(double a, double b)
+{
+	double d; // result; filled from the modified words below
+	std::int32_t x[2]; // receives the bytes of `a` as two 32-bit words
+
+	std::memcpy(x, &a, sizeof(x)); // copy the object representation of `a` into x (memcpy instead of a union, to avoid UB)
+	x[1] = static_cast<std::int32_t>(b * (x[1] - 1072632447) + 1072632447); // scale x[1]'s offset from the constant by b; presumably x[1] is the high (sign/exponent) word, i.e. a little-endian layout is assumed - TODO confirm. NOTE(review): the cast is undefined if the double value does not fit in int32_t
+	x[0] = 0; // the other word is discarded
+
+	std::memcpy(&d, x, sizeof(d)); // reassemble the two words into a double
+	return d;
+}
+

 //! returns 1.0f if val >= 0.0f, -1.0 else
 inline float sign(float val) 
--- a/plugins/Kicker/KickerOsc.h
+++ b/plugins/Kicker/KickerOsc.h
@@ -64,7 +64,7 @@ public:
 	{
 		for( fpp_t frame = 0; frame < frames; ++frame )
 		{
-			const double gain = 1 - std::pow((m_counter < m_length) ? m_counter / m_length : 1, m_env);
+			const double gain = 1 - fastPow((m_counter < m_length) ? m_counter / m_length : 1, m_env);
 			const sample_t s = ( Oscillator::sinSample( m_phase ) * ( 1 - m_noise ) ) + ( Oscillator::noiseSample( 0 ) * gain * gain * m_noise );
 			buf[frame][0] = s * gain;
 			buf[frame][1] = s * gain;
@@ -80,7 +80,7 @@ public:
 			m_FX.nextSample( buf[frame][0], buf[frame][1] );
 			m_phase += m_freq / sampleRate;

-			const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - std::pow(m_counter / m_length, m_slope))) : 0;
+			const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - fastPow(m_counter / m_length, m_slope))) : 0;
 			m_freq = m_endFreq + change;
 			++m_counter;
 		}
--- a/plugins/Monstro/Monstro.cpp
+++ b/plugins/Monstro/Monstro.cpp
@@ -858,7 +858,7 @@ inline sample_t MonstroSynth::calcSlope( int slope, sample_t s )
 {
 	if( m_parent->m_slope[slope] == 1.0f ) return s;
 	if( s == 0.0f ) return s;
-	return std::pow(s, m_parent->m_slope[slope]);
+	return fastPow(s, m_parent->m_slope[slope]);
 }