好的,你有一个非常好的问题,我尝试运行一些基准测试。首先,我没有你的参数,所以我用了你的小例子。其次,由于您没有指定语言,因此我使用了 C + GSL(因为我对 C++ 不太熟悉)
#if __STDC_VERSION__ >= 199901L
#define _XOPEN_SOURCE 700
#else
#define _XOPEN_SOURCE 600
#endif /* __STDC_VERSION__ */
#include <time.h>
#include <gsl/gsl_vector.h>
#include <stdio.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_complex.h>
#include <gsl/gsl_complex_math.h>
#include <gsl/gsl_blas.h>
gsl_vector_complex *A;
gsl_vector_complex *B;
gsl_vector_complex *C;
void evaluate(gsl_vector_complex *in, gsl_vector_complex *out)
{
gsl_complex n_unity;
gsl_complex unity;
gsl_complex zero;
GSL_SET_COMPLEX (&n_unity, -1, 0);
GSL_SET_COMPLEX (&unity, 1, 0);
GSL_SET_COMPLEX (&zero, 0, 0);
const gsl_complex neg_x8 = gsl_complex_mul (n_unity, gsl_vector_complex_get (in, 8));
gsl_vector_complex_set (A, 0, neg_x8);
gsl_vector_complex_set (A, 1, neg_x8);
gsl_vector_complex_set (A, 2, neg_x8);
gsl_vector_complex_set (A, 3, gsl_vector_complex_get (in, 8));
gsl_vector_complex_set (A, 4, gsl_vector_complex_get (out, 4));
gsl_vector_complex_set (A, 5, neg_x8);
gsl_vector_complex_set (A, 6, gsl_vector_complex_get (in, 8));
gsl_vector_complex_set (A, 7, gsl_vector_complex_get (out, 7));
gsl_vector_complex_set (A, 8, gsl_vector_complex_get (in, 4));
gsl_vector_complex_set (A, 9, gsl_vector_complex_get (in, 5));
gsl_vector_complex_set (A, 10, gsl_vector_complex_get (out, 10));
gsl_vector_complex_set (A, 11, gsl_vector_complex_get (in, 6));
gsl_vector_complex_set (A, 12, gsl_vector_complex_get (in, 6));
gsl_vector_complex_set (B, 0, unity);
gsl_vector_complex_set (B, 1, unity);
gsl_vector_complex_set (B, 2, gsl_vector_complex_get (in, 9));
gsl_vector_complex_set (B, 3, gsl_vector_complex_get (in, 6));
gsl_vector_complex_set (B, 4, unity);
gsl_vector_complex_set (B, 5, gsl_vector_complex_get (in, 0));
gsl_vector_complex_set (B, 6, gsl_vector_complex_get (in, 7));
gsl_vector_complex_set (B, 7, unity);
gsl_vector_complex_set (B, 8, gsl_vector_complex_get (in, 4));
gsl_vector_complex_set (B, 9, gsl_vector_complex_get (in, 5));
gsl_vector_complex_set (B, 10, unity);
gsl_vector_complex_set (B, 11, gsl_vector_complex_get (in, 6));
gsl_vector_complex_set (B, 12, gsl_vector_complex_get (in, 6));
gsl_vector_complex_set_basis (C, 1);
gsl_vector_complex_set (C, 2, gsl_vector_complex_get (in, 9));
gsl_vector_complex_set (C, 5, gsl_vector_complex_get (in, 0));
gsl_vector_complex_set (C, 12, gsl_vector_complex_get (in, 6));
gsl_vector_complex_mul (A, B);
gsl_blas_zaxpy (unity, C, A);
gsl_vector_complex_memcpy (out, A);
}
void evaluate_naive(gsl_vector_complex *in, gsl_vector_complex *out)
{
gsl_complex n_unity;
gsl_complex unity;
gsl_complex zero;
GSL_SET_COMPLEX (&n_unity, -1, 0);
GSL_SET_COMPLEX (&unity, 1, 0);
GSL_SET_COMPLEX (&zero, 0, 0);
const gsl_complex neg_x8 = gsl_complex_mul (n_unity, gsl_vector_complex_get (in, 8));
gsl_vector_complex_set (out, 0, neg_x8);
gsl_vector_complex_set (out, 1, gsl_complex_add (unity, neg_x8));
gsl_vector_complex_set (out, 2, gsl_complex_sub (gsl_vector_complex_get (in, 9), gsl_complex_mul (neg_x8, gsl_vector_complex_get (in, 9))));
gsl_vector_complex_set (out, 3, gsl_complex_mul (gsl_vector_complex_get (in, 8), gsl_vector_complex_get (in, 6)));
gsl_vector_complex_set (out, 5, gsl_complex_sub (gsl_vector_complex_get (in, 0), gsl_complex_mul (neg_x8, gsl_vector_complex_get (in, 0))));
gsl_vector_complex_set (out, 6, gsl_complex_mul (gsl_vector_complex_get (in, 8), gsl_vector_complex_get (in, 7)));
gsl_vector_complex_set (out, 8, gsl_complex_mul (gsl_vector_complex_get (in, 4), gsl_vector_complex_get (in, 4)));
gsl_vector_complex_set (out, 9, gsl_complex_mul (gsl_vector_complex_get (in, 5), gsl_vector_complex_get (in, 5)));
gsl_vector_complex_set (out, 11, gsl_complex_mul (gsl_vector_complex_get (in, 6), gsl_vector_complex_get (in, 6)));
gsl_vector_complex_set (out, 12, gsl_complex_add (gsl_vector_complex_get (out, 11), gsl_vector_complex_get (in, 6)));
}
int main()
{
struct timespec start, end;
size_t time_spent;
gsl_vector_complex *x;
gsl_vector_complex *y;
double x_,y_;
gsl_complex val;
const gsl_rng_type * T;
gsl_rng * r;
gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
x = gsl_vector_complex_calloc (13);
y = gsl_vector_complex_calloc (13);
A = gsl_vector_complex_calloc (13);
B = gsl_vector_complex_calloc (13);
C = gsl_vector_complex_calloc (13);
for (size_t iteration = 0; iteration < 100000000; iteration++)
{
for (size_t index = 0; index < 13; index++)
{
x_ = gsl_ran_gaussian_ziggurat (r, 1.0);
y_ = gsl_ran_gaussian_ziggurat (r, 1.0);
GSL_SET_COMPLEX(&val, x_, y_);
gsl_vector_complex_set (x, index, val);
}
for (size_t index = 0; index < 13; index++)
{
x_ = gsl_ran_gaussian_ziggurat (r, 1.0);
y_ = gsl_ran_gaussian_ziggurat (r, 1.0);
GSL_SET_COMPLEX(&val, x_, y_);
gsl_vector_complex_set (y, index, val);
}
clock_gettime (CLOCK_MONOTONIC, &start);
#ifdef NAIVE
evaluate_naive (x,y);
#else
evaluate (x, y);
#endif
clock_gettime (CLOCK_MONOTONIC, &end);
time_spent = (end.tv_sec - start.tv_sec) * 1000000000 + (end.tv_nsec - start.tv_nsec);
printf("%e\n", 1.0 * time_spent);
}
gsl_rng_free (r);
gsl_vector_complex_free(x);
gsl_vector_complex_free(y);
gsl_vector_complex_free(A);
gsl_vector_complex_free(B);
gsl_vector_complex_free(C);
return 0;
}
我尝试了矢量化和幼稚的实现,这是我的结果:
gcc, no optimizations, naive version, mean = 270.6; std = 190.6 ns
gcc, no optimizations, vector version, mean = 318.1; std = 381.1 ns
gcc, optimized, naive version, mean = 106.9; std = 160.3 ns
gcc, optimized, vector version, mean = 154.5; std = 109.6 ns
gcc, optimized, machine-specific, naive version, mean = 105.2; std = 96.2 ns
gcc, optimized, machine-specific, vector version, mean = 154.8; std = 144.8 ns
icc, optimized, naive version, mean = 75.6; std = 99.4 ns
icc, optimized, vector version, mean = 164.6; std = 209.9 ns
icc, optimized, machine-specific, naive version, mean = 75.8; std = 189.5 ns
icc, optimized, machine-specific, vector version, mean = 166.9; std = 1850.2 ns // this std is somehow bad, however, I used my laptop to run the tests
icc, no optimizations, naive version, mean = 130.9; std = 153.6 ns
icc, no optimizations, vector version, mean = 266.6; std = 238.8 ns
QED 如果您不确定该怎么做,让编译器 (icc) 发挥它的黑魔法。这种情况可能是特定于平台的,此外,当您增加数组的数量/大小时,时间可能会发生变化。PS 还有一种可能的解决方案可以尝试:Intel 提供了自己的元素向量操作函数,可以改善结果,但是,您必须重写代码。PPS 实际上,我完全忘记了默认情况下 icc 不遵循 IEEE 标准(相当于 -ffast-math),并且可以极快地为您提供极不正确的结果。意识到。