From 2f732ad98339b9ab2685d064b57cf7ffca163113 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Wed, 1 May 2024 00:43:22 +0100
Subject: [PATCH] Improved performance of the iterative functions like
vect_apply and vect_apply_if
---
.vscode/settings.json | 7 +++-
src/zvector.c | 95 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 90 insertions(+), 12 deletions(-)
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6272eb7..11acaa7 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -130,6 +130,11 @@
"array": "c",
"string": "c",
"string_view": "c",
- "vector": "c"
+ "vector": "c",
+ "__bit_reference": "c",
+ "__hash_table": "c",
+ "__split_buffer": "c",
+ "initializer_list": "c",
+ "unordered_map": "c"
}
}
diff --git a/src/zvector.c b/src/zvector.c
index cfad4d5..d4a7044 100644
--- a/src/zvector.c
+++ b/src/zvector.c
@@ -2391,11 +2391,45 @@ void vect_swap_range(ivector v, const zvect_index s1, const zvect_index e1,
}
// Let's swap items:
- register zvect_index i;
- for (register zvect_index j = s1; j <= (s1 + end); j++) {
- i = j - s1;
- void *temp = v->data[v->begin + j];
- v->data[v->begin + j] = v->data[v->begin + (s2 + i)];
+ register zvect_index i = 0;
+ register zvect_index j = s1;
+ // Unroll by 4 only when (s1 + end) > 4, so that (s1 + end)-4 below cannot go negative:
+ if ((s1 + end) > 4) {
+ // Each pass swaps 4 consecutive slots of the s1 range with their s2 counterparts:
+ for (j = s1; j <= (s1 + end)-4; j += 4) {
+ i = j - s1;
+#if (COMPILER == COMPILER_GCC)
+ __builtin_prefetch(&v->data[v->begin + j + 4]); // hint the next slot itself, not the item stored in it
+ #pragma GCC unroll 4
+ for (register zvect_index k = 0; k < 4; k++) {
+ void *temp = v->data[v->begin + j + k];
+ v->data[v->begin + j + k] = v->data[v->begin + (s2 + i + k)];
+ v->data[v->begin + (s2 + i + k)] = temp;
+ }
+#else
+ void *temp = v->data[v->begin + j];
+ v->data[v->begin + j] = v->data[v->begin + (s2 + i)];
+ v->data[v->begin + (s2 + i)] = temp;
+
+ temp = v->data[v->begin + j + 1];
+ v->data[v->begin + j + 1] = v->data[v->begin + (s2 + i + 1)];
+ v->data[v->begin + (s2 + i + 1)] = temp;
+
+ temp = v->data[v->begin + j + 2];
+ v->data[v->begin + j + 2] = v->data[v->begin + (s2 + i + 2)];
+ v->data[v->begin + (s2 + i + 2)] = temp;
+
+ temp = v->data[v->begin + j + 3];
+ v->data[v->begin + j + 3] = v->data[v->begin + (s2 + i + 3)];
+ v->data[v->begin + (s2 + i + 3)] = temp;
+#endif
+ }
+ }
+ // Swap whatever the unrolled loop left over (or every item, if it did not run):
+ for (register zvect_index j2 = j; j2 <= (s1 + end); j2++) {
+ i = j2 - s1;
+ void *temp = v->data[v->begin + j2];
+ v->data[v->begin + j2] = v->data[v->begin + (s2 + i)];
v->data[v->begin + (s2 + i)] = temp;
}
@@ -3229,19 +3263,27 @@ void vect_apply(ivector v, void (*f)(void *)) {
// Process the vector:
// Check if we can do loop unrolling
- if ((p_vect_size(v) & 3) == 0) {
+ zvect_index vsize = p_vect_size(v);
+ register zvect_index i = 0;
+ if (vsize > 4) { // guard keeps (vsize - 4) in the loop bound below from going negative
// Nice, we can do some unrolled apply:
- for (register zvect_index i = 0; i < p_vect_size(v); i+=4)
+ for (i = 0; i <= (vsize - 4); i+=4)
{
+#if (COMPILER == COMPILER_GCC)
+ #pragma GCC unroll 4
+ for (zvect_index x=i; x < i + 4; x++)
+ (*f)(v->data[v->begin + x]);
+#else
(*f)(v->data[v->begin + i]);
(*f)(v->data[v->begin + (i+1)]);
(*f)(v->data[v->begin + (i+2)]);
(*f)(v->data[v->begin + (i+3)]);
+#endif
}
- } else {
- for (register zvect_index i = p_vect_size(v); i--;)
- (*f)(v->data[v->begin + i]);
}
+ // Cleanup loop: applies f to the 0-3 items left after the unrolled loop, or to every item when vsize <= 4
+ for (; i < vsize; i++)
+ (*f)(v->data[v->begin + i]);
//VECT_APPLY_DONE_PROCESSING:
#if (ZVECT_THREAD_SAFE == 1)
@@ -3387,7 +3429,38 @@ void vect_apply_if(ivector v1, const_vector const v2, void (*f1)(void *),
}
// Process vectors:
- for (register zvect_index i = p_vect_size(v1); i--;)
+ // For every index i, call f1 on the v1 item when f2(v1 item, v2 item)
+ // evaluates as true. Items are visited in ascending index order (the loop
+ // this replaces walked from the last item down to the first).
+ // Unrolled by 4 when vsize > 4; the guard also keeps (vsize - 4) from going negative.
+ zvect_index vsize = p_vect_size(v1);
+ register zvect_index i = 0;
+ if (vsize > 4) {
+ // Nice, we can do some unrolled apply:
+ for (i = 0; i <= (vsize - 4); i+=4)
+ {
+#if (COMPILER == COMPILER_GCC)
+ #pragma GCC unroll 4
+ for (zvect_index x=i; x < i + 4; x++)
+ if ((*f2)(v1->data[v1->begin + x], v2->data[v2->begin + x]))
+ (*f1)(v1->data[v1->begin + x]);
+#else
+ if ((*f2)(v1->data[v1->begin + i], v2->data[v2->begin + i]))
+ (*f1)(v1->data[v1->begin + i]);
+
+ if ((*f2)(v1->data[v1->begin + (i+1)], v2->data[v2->begin + (i+1)]))
+ (*f1)(v1->data[v1->begin + (i+1)]);
+
+ if ((*f2)(v1->data[v1->begin + (i+2)], v2->data[v2->begin + (i+2)]))
+ (*f1)(v1->data[v1->begin + (i+2)]);
+
+ if ((*f2)(v1->data[v1->begin + (i+3)], v2->data[v2->begin + (i+3)]))
+ (*f1)(v1->data[v1->begin + (i+3)]);
+#endif
+ }
+ }
+ // Cleanup loop: handles the 0-3 items left after the unrolled loop, or every item when vsize <= 4
+ for (; i < vsize; i++)
if ((*f2)(v1->data[v1->begin + i], v2->data[v2->begin + i]))
(*f1)(v1->data[v1->begin + i]);