Commit 46e69145 authored by Dubský Jan
Browse files

Solution 2.06 (Speed: 1.22)

parent b503c68b
......@@ -35,19 +35,23 @@ protected:
assert(zeroth_row);
zeroth_row += CACHE_PAD;
for (size_t i = 0; i < VEC_SIZE; ++i) zeroth_row[i] = i;
zeroth_row[-1] = 1;
buffer1 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer1);
buffer1 += CACHE_PAD;
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer2);
buffer2 += CACHE_PAD;
if (0) buffer2 = buffer1;
else {
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer2);
buffer2 += CACHE_PAD;
}
}
~levenstein_base() {
    // The constructor advances each pointer by CACHE_PAD elements after
    // allocating it, so step back by CACHE_PAD before handing it to free().
    //
    // buffer2 may alias buffer1 (the constructor contains a switch that makes
    // them share one allocation). Check for aliasing while both pointers are
    // still valid, and release buffer2 exactly once and only when it is a
    // distinct allocation -- freeing it unconditionally as well would be a
    // double free.
    if (buffer2 != buffer1) free(buffer2 - CACHE_PAD);
    free(buffer1 - CACHE_PAD);
    free(zeroth_row - CACHE_PAD);
}
......@@ -138,13 +142,13 @@ inline void prt_vec(U vec, const char* desc) {
#endif
// Sequential "carry the minimum" step for lane k of a 128-bit vector.
//
// Expects two variables in the enclosing scope:
//   curr     - __m128i holding four 32-bit values (read and possibly updated)
//   prev_val - uint32_t running value carried over from the previous lane
//
// prev_val is incremented by one; if it is then smaller than lane k of curr,
// it is written into that lane, otherwise prev_val takes the lane's value.
// Invoke once per lane, in increasing k, e.g. gen_min_loop_128(0); ... (3);
#define gen_min_loop_128(k)                                                  \
    {                                                                        \
        prev_val += 1;                                                       \
        uint32_t curr_val = _mm_extract_epi32(curr, k);                      \
        if (prev_val < curr_val) curr = _mm_insert_epi32(curr, prev_val, k); \
        else                                                                 \
            prev_val = curr_val;                                             \
    }
template <>
......@@ -153,28 +157,39 @@ public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
//std::uint32_t compute(const std::uint32_t* __restrict__ a, const std::uint32_t* __restrict__ b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
prev_vec[-1] = i + 1;
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * j) - 1));
//__m128i prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * j) - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * j - 1)));
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
curr = _mm_min_epu32(curr, prev_shr);
prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
#if 1
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
#endif
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
#if 0
#if 0
uint32_t prev = curr_vec[0];
for (size_t j = 1; j < A_SIZE + 1; ++j) {
......@@ -185,21 +200,23 @@ public:
prev = curr;
}
#else
uint32_t prev = curr_vec[0] - 1;
uint32_t prev_val = curr_vec[0] - 1;
for (size_t j = 0; j < VEC_CNT; ++j) {
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i tmp = _mm_load_si128(addr);
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i curr = _mm_load_si128(addr);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128(addr, tmp);
_mm_store_si128(addr, curr);
}
#endif
#endif
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
//if (!i) prev_vec = buffer1;
}
return prev_vec[A_SIZE];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment