Here's my current optimized code (SIMD with 16 doubles per vector):
// gcc -std=c11 -Wall -Wextra -pedantic -O3 -march=native -fopenmp -o simd16 simd16.c -lm
// time ./simd16 > out.pgm
#include <complex.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
typedef int32_t v16i32 __attribute__ ((vector_size (64)));
typedef int64_t v16i64 __attribute__ ((vector_size (128)));
typedef double v16f64 __attribute__ ((vector_size (128)));
static inline int64_t and(v16i64 a)
{
return
a[0] & a[1] & a[2] & a[3] &
a[4] & a[5] & a[6] & a[7] &
a[8] & a[9] & a[10] & a[11] &
a[12] & a[13] & a[14] & a[15];
}
static v16f64 m_compute_pixel
( const int32_t N
, const double R
, const v16f64 cx
, const v16f64 cy
)
{
const double R2 = R * R;
v16f64 x = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
v16f64 y = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int32_t n = 0;
v16f64 x_old = x;
v16f64 y_old = y;
int n_old = n;
v16f64 x2 = x * x;
v16f64 y2 = y * y;
while (n < N && and(x2 + y2 <= 4.0))
{
x_old = x;
y_old = y;
n_old = n;
for (int i = 0; i < 8; ++i)
{
y = 2.0 * x * y + cy;
x = x2 - y2 + cx;
x2 = x * x;
y2 = y * y;
}
n = n + 8;
}
x = x_old;
y = y_old;
n = n_old;
x2 = x * x;
y2 = y * y;
v16f64 s;
for (int k = 0; k < 16; ++k)
{
int m = n;
while (m < N && x2[k] + y2[k] <= R2)
{
y[k] = 2.0 * x[k] * y[k] + cy[k];
x[k] = x2[k] - y2[k] + cx[k];
x2[k] = x[k] * x[k];
y2[k] = y[k] * y[k];
m = m + 1;
}
s[k] = m < N ? m + 1 - log2(0.5 * log(x2[k] + y2[k])) : -N;
}
return s;
}
int main()
{
const int S = 1 << 10;
const int N = 1 << 20;
const double R = 1 << 18;
unsigned char *image = malloc(S * S);
#pragma omp parallel for schedule(static, 1)
for (int j = 0; j < S >> 2; ++j)
{
for (int i = 0; i < S >> 2; ++i)
{
v16f64 cx, cy;
int k = 0;
for (int dj = 0; dj < 4; ++dj)
for (int di = 0; di < 4; ++di)
{
cx[k] = ((i << 2) + di + 0.5) / S * 4 - 2;
cy[k] = ((j << 2) + dj + 0.5) / S * 4 - 2;
k = k + 1;
}
const v16f64 mu = m_compute_pixel(N, R, cx, cy);
k = 0;
for (int dj = 0; dj < 4; ++dj)
for (int di = 0; di < 4; ++di)
{
image[((j << 2) + dj) * S + (i << 2) + di] = 16 * mu[k];
k = k + 1;
}
}
}
printf("P5\n%d %d\n255\n", S, S);
fwrite(image, S * S, 1, stdout);
free(image);
return 0;
}