L
Jetzt habe ich ein wenig optimiert...
Das Ergebnis ist, dass bis ca. n=28 Insertion-Sort schneller ist als Quicksort für -O0, und bis ca. n=72 Insertion-Sort schneller ist als Quicksort für -O3.
Die Überraschung ist also, dass -O3 den Code langsamer anstatt schneller macht.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
volatile size_t initial_seed = 123;
size_t total_hash = 23;
size_t rand1(size_t seed, size_t max)
{
size_t a = 16807;
return (a * seed) % max;
}
size_t my_rand(size_t max)
{
size_t r = rand1(initial_seed, max);
initial_seed = rand1(initial_seed, -1);
return r;
}
void rfill(size_t n, size_t m, size_t a[m][n])
{
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < n; j++)
{
a[i][j] = my_rand(51);
}
}
}
double average(size_t n, size_t a[])
{
// The average should be around 25.
double average = 0;
for (size_t i = 0; i < n; i++)
{
average += (double)a[i] / (double)n;
}
return average;
}
// Fast, combining hash function with few collisions and h(a,b) often not equal to h(b,a)...
size_t my_hash(size_t a, size_t b)
{
return a * 31 + b;
}
void add_to_total_hash(size_t n, size_t m, size_t a[m][n])
{
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < n; j++)
{
total_hash = my_hash(total_hash, a[i][j]);
}
}
}
void print(size_t n, size_t m, size_t a[m][n])
{
add_to_total_hash(n, m, a);
for (size_t i = 0; i < n; i++)
{
printf("%zu ", a[0][i]);
}
printf("\na: %f\n...\n", average(n, a[0]));
// ...
for (size_t i = 0; i < n; i++)
{
printf("%zu ", a[m - 1][i]);
}
printf("\na: %f\nh: %zu\n\n", average(n, a[m - 1]), total_hash);
}
// Comparison function for sort_a
int compare(const void *a, const void *b)
{
return (*(size_t *)a - *(size_t *)b);
}
// Quicksort
void sort_a_quick(size_t n, size_t arr[])
{
qsort(arr, n, sizeof(size_t), compare);
}
// Bubble sort
void sort_b_bubble(size_t n, size_t arr[])
{
size_t i = 0, j = 1, a, b;
while (j < n)
{
a = arr[i];
b = arr[j];
if (a > b)
{
arr[i] = b;
arr[j] = a;
i = 0;
j = 1;
continue;
}
i++;
j++;
}
}
// Selection sort
void sort_c_selection(size_t n, size_t arr[])
{
for (size_t i = 0; i < n - 1; i++)
{
size_t a = i;
size_t b = arr[i];
for (size_t j = i + 1; j < n; j++)
{
if (arr[j] < b)
{
a = j;
b = arr[j];
}
}
arr[a] = arr[i];
arr[i] = b;
}
}
// Insertion sort
void sort_d_insertion(size_t n, size_t arr[])
{
size_t i = 0, j = 1, k, l, a, b;
while (j < n)
{
a = arr[i];
b = arr[j];
if (a > b)
{
k = i;
l = j;
do
{
arr[k] = b;
arr[l] = a;
if (k == 0)
{
break;
}
k--;
l--;
a = arr[k];
b = arr[l];
} while (a > b);
}
i++;
j++;
}
}
#pragma GCC push_options
#pragma GCC optimize("O0")
void sort_e_insertion_optimized(size_t n, size_t arr[])
{
asm volatile(
" movq %%rdi, -56(%%rbp) \n"
" movq %%rsi, -64(%%rbp) \n"
" movq $0, -8(%%rbp) \n"
" movq $1, -16(%%rbp) \n"
" movq -16(%%rbp), %%rax \n"
"1: \n"
" movq -8(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rdx, %%rax \n"
" movq (%%rax), %%rax \n"
" movq %%rax, -40(%%rbp) \n"
" movq -16(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rdx, %%rax \n"
" movq (%%rax), %%rax \n"
" movq %%rax, -48(%%rbp) \n"
" movq -40(%%rbp), %%rax \n"
" cmpq %%rax, -48(%%rbp) \n"
" jnb 3f \n"
" movq -8(%%rbp), %%rax \n"
" movq %%rax, -24(%%rbp) \n"
" movq -16(%%rbp), %%rax \n"
" movq %%rax, -32(%%rbp) \n"
"2: \n"
" movq -24(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rax, %%rdx \n"
" movq -48(%%rbp), %%rax \n"
" movq %%rax, (%%rdx) \n"
" movq -32(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rax, %%rdx \n"
" movq -40(%%rbp), %%rax \n"
" movq %%rax, (%%rdx) \n"
" cmpq $0, -24(%%rbp) \n"
" je 3f \n"
" subq $1, -24(%%rbp) \n"
" subq $1, -32(%%rbp) \n"
" movq -24(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rdx, %%rax \n"
" movq (%%rax), %%rax \n"
" movq %%rax, -40(%%rbp) \n"
" movq -32(%%rbp), %%rax \n"
" leaq 0(,%%rax,8), %%rdx \n"
" movq -64(%%rbp), %%rax \n"
" addq %%rdx, %%rax \n"
" movq (%%rax), %%rax \n"
" movq %%rax, -48(%%rbp) \n"
" movq -40(%%rbp), %%rax \n"
" cmpq %%rax, -48(%%rbp) \n"
" jb 2b \n"
"3: \n"
" addq $1, -8(%%rbp) \n"
" addq $1, -16(%%rbp) \n"
" movq -16(%%rbp), %%rax \n"
" cmpq -56(%%rbp), %%rax \n"
" jb 1b \n"
: /* No outputs. */
: /* No inputs. */
: "memory");
}
#pragma GCC pop_options
double profile_sort_func(void (*f)(size_t, size_t *), size_t n, size_t m, size_t arr[m][n])
{
rfill(n, m, arr);
print(n, m, arr);
clock_t c1 = clock();
for (size_t i = 0; i < m; i++)
{
(*f)(n, arr[i]);
}
clock_t c2 = clock();
print(n, m, arr);
return (double)(c2 - c1) / CLOCKS_PER_SEC;
}
int get_sort_winner(size_t n, size_t m)
{
size_t arr[m][n];
double secs[5] = {
profile_sort_func(sort_a_quick, n, m, arr),
profile_sort_func(sort_b_bubble, n, m, arr),
profile_sort_func(sort_c_selection, n, m, arr),
profile_sort_func(sort_d_insertion, n, m, arr),
profile_sort_func(sort_e_insertion_optimized, n, m, arr),
};
int min_index = 0;
double min = secs[0], max = secs[0];
for (size_t i = 0; i < 5; i++)
{
if (secs[i] < min)
{
min_index = i;
min = secs[i];
}
if (secs[i] > max)
{
max = secs[i];
}
}
printf("%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n",
secs[0], secs[0] * 100 / max,
secs[1], secs[1] * 100 / max,
secs[2], secs[2] * 100 / max,
secs[3], secs[3] * 100 / max,
secs[4], secs[4] * 100 / max);
return min_index;
}
int main(int argc, char const *argv[])
{
size_t n = 5;
size_t m = 10000;
while (get_sort_winner(n, m) != 0)
{
n++;
}
}
Edit: In der Assemblercodeoptimierung habe ich aus der äußeren while-Schleife eine do-while-Schleife gemacht... das ist natürlich gefährlich. Außerdem habe ich nop's und unnötige Labels bzw. Sprünge entfernt.