自编以e为底的指数函数exp，性能接近标准库函数

算法描述：
(1). 先做自变量x的范围检查，对于双精度浮点数，自变量不能超出(-1022ln2, 1024ln2)=(-708.39, 709.78)，否则exp(x)会溢出。对于单精度浮点数，自变量不能超出(-126ln2, 128ln2)=(-87.33, 88.72). 自己使用此函数时，如果能通过其它途径保证自变量不会超出前述范围，那么可以省略这两个判断，提高速度。
(2). 令 x=k1·ln2+b1, 那么 $e^x=2^{k_1}\cdot e^{b_1}$ ，其中k1为整数，可正可负， $k_1=floor(\frac{x}{\ln 2}),\ b_1\in[0,\ln2)$ ，但是floor函数的调用代价极大（为什么floor()这么慢？-腾讯云，glibc/sysdeps/ieee754/dbl-64/s_floor.c GitHub），所以改用强制类型转换static_cast获得k1.
(3). b1的范围还是太大，如果直接对b1使用e^x在x=0处的泰勒级数，即使使用阶数很高的泰勒级数，精度也不够高，所以再做变换，令 $b_1=k_2 \cdot \frac{\ln2}{128} + b_2,\ \ k_2=0,1,2,\cdots,127,\ \ b_2\in [0,\frac{\ln2}{128} )$ ，把 $\exp(k_2 \cdot \frac{\ln2}{128})$ 用数组存起来，然后对b2使用 e^x 在 x=0 处的泰勒级数：

$e^{b_2}=1+b_2+\frac{b_2^2}{2!}+\frac{b_2^3}{3!}+\frac{b_2^4}{4!}+\frac{b_2^5}{5!}$

多项式求值采用秦九韶算法，同时还使用fmadd指令加速运算(融合乘加，intel _mm_fmadd_sd )
(4). 不要用浮点乘法计算 $2^{k_1}\cdot e^{b_1}$ ，而是利用IEEE 754浮点数的格式，直接用位运算修改浮点数的阶码，不仅速度快，而且没有乘法的误差累积。

标准库的算法可参考：https://github.com/bminor/glibc/blob/master/sysdeps/ieee754/dbl-64/e_exp.c

最终的效果是，精度稍差于标准库，特别是自变量比较大时。速度十分接近标准库，当抛弃判断自变量是否在区间(-708.39, 709.78)内时，以及不对NAN情况进行处理，速度甚至稍快于标准库。

C++代码如下：

#include<stdio.h>
#include<math.h>
#include<time.h>
#include<immintrin.h>#define FMADDconstexpr double ln2 = 0.6931471805599453;
constexpr double ln2_128 = 0.0054152123481245727; // =(ln2)/128
constexpr double d1_ln2 = 1.4426950408889634;     // =1/ln2
constexpr double d128_ln2 = 184.6649652337873;    // =128/ln2// DBL_MIN = 2^(-1022)=2.2250738585072014e-308，  
// DBL_MAX = 2^1024   =1.7976931348623158e+308，两者均在float.h中定义
constexpr double ln_DBL_MIN = -708.39641853226411;  // =ln(DBL_MIN)= -1022*ln(2)
constexpr double ln_DBL_MAX = 709.78271289338400;   // =ln(DBL_MAX)=  1024*ln(2)__m128d c6 = _mm_set_sd(1.0 / 720.0);
__m128d c5 = _mm_set_sd(1.0 / 120.0);
__m128d c4 = _mm_set_sd(1.0 / 24.0);
__m128d c3 = _mm_set_sd(1.0 / 6.0);
__m128d c2 = _mm_set_sd(1.0 / 2.0);
__m128d c1 = _mm_set_sd(1.0);//exp(0),exp(ln2/128),exp(2*ln2/128),exp(3*ln2/128),...exp(127*ln2/128),
double expln2_128Array[128] = {
1.0,
1.0054299011128028, 1.0108892860517005, 1.0163783149109530,
1.0218971486541167, 1.0274459491187637, 1.0330248790212284,
1.0386341019613788, 1.0442737824274138, 1.0499440858006873,
1.0556451783605572, 1.0613772272892621, 1.0671404006768236,
1.0729348675259756, 1.0787607977571198, 1.0846183622133092,
1.0905077326652577, 1.0964290818163768, 1.1023825833078409,
1.1083684117236786, 1.1143867425958925, 1.1204377524096067,
1.1265216186082419, 1.1326385195987192, 1.1387886347566917,
1.1449721444318042, 1.1511892299529827, 1.1574400736337510,
1.1637248587775775, 1.1700437696832502, 1.1763969916502813,
1.1827847109843410, 1.1892071150027211, 1.1956643920398274,
1.2021567314527031, 1.2086843236265816, 1.2152473599804689,
1.2218460329727575, 1.2284805361068700, 1.2351510639369333,
1.2418578120734840, 1.2486009771892047, 1.2553807570246911,
1.2621973503942507, 1.2690509571917332, 1.2759417783963921,
1.2828700160787783, 1.2898358734066658, 1.2968395546510097,
1.3038812651919359, 1.3109612115247643, 1.3180796012660640,
1.3252366431597413, 1.3324325470831614, 1.3396675240533030,
1.3469417862329458, 1.3542555469368927, 1.3616090206382248,
1.3690024229745906, 1.3764359707545301, 1.3839098819638320,
1.3914243757719262, 1.3989796725383111, 1.4065759938190154,
1.4142135623730950, 1.4218926021691656, 1.4296133383919700,
1.4373759974489823, 1.4451808069770466, 1.4530279958490526,
1.4609177941806470, 1.4688504333369819, 1.4768261459394993,
1.4848451658727525, 1.4929077282912648, 1.5010140696264255,
1.5091644275934227, 1.5173590411982147, 1.5255981507445383,
1.5338819978409560, 1.5422108254079408, 1.5505848776850000,
1.5590044002378370, 1.5674696399655529, 1.5759808451078865,
1.5845382652524937, 1.5931421513422669, 1.6017927556826934,
1.6104903319492543, 1.6192351351948637, 1.6280274218573478,
1.6368674497669645, 1.6457554781539648, 1.6546917676561944,
1.6636765803267364, 1.6727101796415966, 1.6817928305074291,
1.6909247992693052, 1.7001063537185235, 1.7093377631004628,
1.7186192981224779, 1.7279512309618376, 1.7373338352737062,
1.7467673861991689, 1.7562521603732995, 1.7657884359332728,
1.7753764925265213, 1.7850166113189350, 1.7947090750031072,
1.8044541678066239, 1.8142521755003988, 1.8241033854070533,
1.8340080864093425, 1.8439665689586259, 1.8539791250833856,
1.8640460483977890, 1.8741676341102999, 1.8843441790323344,
1.8945759815869656, 1.9048633418176742, 1.9152065613971473,
1.9256059436361249, 1.9360617934922945, 1.9465744175792333,
1.9571441241754003, 1.9677712232331758, 1.9784560263879510,
1.9891988469672664 };double myexp(double x) {// 自己使用时如果确定x不会超出范围(-708.39, 709.78)，// 可以去掉下面两个判断，提高速度//if (x < ln_DBL_MIN) {//	return 0;//}//if (x > ln_DBL_MAX) {//	return INFINITY;//}long long k1;if (x < 0){// k1 = floor(x * d1_ln2 - 1)// floor函数的耗时远远多于static_castk1 = static_cast<long long>(x * d1_ln2 - 1);}else {k1 = static_cast<long long>(x * d1_ln2);}x = x - k1 * ln2;//此时必有x>0//int k2 = floor(x * d128_ln2);int k2 = static_cast<int>(x * d128_ln2);x = x - k2 * ln2_128;double t;
#ifdef FMADD__m128d t128 = c5;__m128d x128 = _mm_set_sd(x);t128 = _mm_fmadd_sd(t128, x128, c4);t128 = _mm_fmadd_sd(t128, x128, c3);t128 = _mm_fmadd_sd(t128, x128, c2);t128 = _mm_fmadd_sd(t128, x128, c1);t128 = _mm_fmadd_sd(t128, x128, c1);t = _mm_cvtsd_f64(t128);
#elset = 1.0 / 24.0 + x * 1.0 / 120.0;t = 1.0 / 6.0 + x * t;t = 1.0 / 2.0 + x * t;t = 1.0  + x * t;t = 1.0  + x * t;
#endift *= expln2_128Array[k2];unsigned long long LLT = *reinterpret_cast<unsigned long long*>(&t);LLT += (k1 << 52);t = *reinterpret_cast<double*>(&LLT);return t;
}int main() {printf("double,精度测试\n");for (double x = -3.0; x < 2.0; x += 0.3) {printf("myexp(%2.1f)=%18.16lf\n  exp(%2.1f)=%18.16lf\n-------\n", x, myexp(x), x, exp(x));}for (double x = 3; x < 20; x += 1.0) {printf("myexp(%2.1f)=%18.16lf\n  exp(%2.1f)=%18.16lf\n-------\n", x, myexp(x), x, exp(x));}//----------------------double x1 = -70.0, x2 = 70.0, dx = 1e-7;printf("速度测试，x1 = %4.2f, x2 = %4.2f, 编译器优化设为/O2, Core i7-11800H \n",x1,x2);clock_t start = clock();double sum = 0.0;for (double x = x1; x < x2; x += dx) {sum += myexp(x);}printf("sum=%lf,  myexp_Time: %fs\n", sum, (double)(clock() - start) / CLOCKS_PER_SEC);start = clock();sum = 0.0;for (double x = x1; x < x2; x += dx) {sum += exp(x);}printf("sum=%lf,    exp_Time: %fs\n", sum, (double)(clock() - start) / CLOCKS_PER_SEC);
}

结果如下：

double型,精度测试
myexp(-3.0)=0.0497870683678639exp(-3.0)=0.0497870683678639
-------
myexp(-2.7)=0.0672055127397497exp(-2.7)=0.0672055127397498
-------
myexp(-2.4)=0.0907179532894125exp(-2.4)=0.0907179532894125
-------
myexp(-2.1)=0.1224564282529818exp(-2.1)=0.1224564282529818
-------
myexp(-1.8)=0.1652988882215864exp(-1.8)=0.1652988882215864
-------
myexp(-1.5)=0.2231301601484297exp(-1.5)=0.2231301601484297
-------
myexp(-1.2)=0.3011942119122020exp(-1.2)=0.3011942119122020
-------
myexp(-0.9)=0.4065696597405990exp(-0.9)=0.4065696597405989
-------
myexp(-0.6)=0.5488116360940263exp(-0.6)=0.5488116360940263
-------
myexp(-0.3)=0.7408182206817177exp(-0.3)=0.7408182206817177
-------
myexp(-0.0)=0.9999999999999997exp(-0.0)=0.9999999999999997
-------
myexp(0.3)=1.3498588075760027exp(0.3)=1.3498588075760027
-------
myexp(0.6)=1.8221188003905087exp(0.6)=1.8221188003905082
-------
myexp(0.9)=2.4596031111569490exp(0.9)=2.4596031111569490
-------
myexp(1.2)=3.3201169227365468exp(1.2)=3.3201169227365468
-------
myexp(1.5)=4.4816890703380636exp(1.5)=4.4816890703380636
-------
myexp(1.8)=6.0496474644129448exp(1.8)=6.0496474644129448
-------
myexp(3.0)=20.0855369231876679exp(3.0)=20.0855369231876679
-------
myexp(4.0)=54.5981500331442362exp(4.0)=54.5981500331442362
-------
myexp(5.0)=148.4131591025766284exp(5.0)=148.4131591025765999
-------
myexp(6.0)=403.4287934927352239exp(6.0)=403.4287934927351102
-------
myexp(7.0)=1096.6331584284587279exp(7.0)=1096.6331584284585006
-------
myexp(8.0)=2980.9579870417278471exp(8.0)=2980.9579870417283018
-------
myexp(9.0)=8103.0839275753914990exp(9.0)=8103.0839275753842230
-------
myexp(10.0)=22026.4657948067251709exp(10.0)=22026.4657948067178950
-------
myexp(11.0)=59874.1417151978530455exp(11.0)=59874.1417151978166657
-------
myexp(12.0)=162754.7914190039737150exp(12.0)=162754.7914190039155073
-------
myexp(13.0)=442413.3920089205494151exp(13.0)=442413.3920089204912074
-------
myexp(14.0)=1202604.2841647767927498exp(14.0)=1202604.2841647767927498
-------
myexp(15.0)=3269017.3724721106700599exp(15.0)=3269017.3724721106700599
-------
myexp(16.0)=8886110.5205078702419996exp(16.0)=8886110.5205078721046448
-------
myexp(17.0)=24154952.7535753361880779exp(17.0)=24154952.7535752989351749
-------
myexp(18.0)=65659969.1373304873704910exp(18.0)=65659969.1373305097222328
-------
myexp(19.0)=178482300.9631871581077576exp(19.0)=178482300.9631872475147247
-------
速度测试，x1 = -70.00, x2 = 70.00, 编译器优化设为/O2, Core i7-11800H
sum=25154387663949415049142870006785114112.000000,  myexp_Time: 5.674000s
sum=25154387663949410326776387137139900416.000000,    exp_Time: 5.728000s