Commit 6be4c384 authored by Dubský Jan's avatar Dubský Jan
Browse files

Solution 2.08.1 (Speed: 0.96)

parent 4441a98d
......@@ -12,13 +12,13 @@
namespace levensol {
template <typename policy>
class levenstein;
// Empty tag types. They carry no data and are only used as the `policy`
// template argument that selects a `levenstein<policy>` specialization.
struct policy_sse {};
struct policy_avx {};
struct policy_avx512 {};
template <typename policy>
class levenstein;
class levenstein_base {
protected:
levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
......@@ -42,6 +42,8 @@ protected:
for (size_t i = 0; i < VEC_SIZE; ++i) buffer1[i] = buffer2[i] = -1;
}
// Unfortunately, the destructor is probably included in the measured part of the solution.
// Because proper cleanup is costly, the pointers are simply dropped (the buffers are leaked) in the hope that there will be enough memory.
~levenstein_base() {
return;
free(buffer1 - CACHE_PAD);
......@@ -71,6 +73,7 @@ private:
}
};
// As part of testing, a scalar Levenshtein implementation was written too.
#if 0
struct policy_scalar {};
......@@ -101,6 +104,7 @@ public:
};
#endif
// I was not able to find a way to force GCC to unroll a loop with a fixed range.
#define gen_min_loop_128(k) \
{ \
prev_val += 1; \
......@@ -172,68 +176,6 @@ public:
levenstein<policy_sse>(a_size, b_size){};
};
// FIXME: Remove 0 in condition
#if USE_AVX && 0
// Specialization of `levenstein` for `policy_avx`, written with 256-bit integer
// intrinsics (eight 32-bit lanes per vector). It inherits `levenstein_base`
// privately (default for `class`).
// NOTE(review): this block is guarded by `#if USE_AVX && 0`, so it is never
// compiled as the file stands; several spots below look unfinished.
template <>
class levenstein<policy_avx> : levenstein_base {
public:
// Forwards both sizes to `levenstein_base` and passes 8 as its `width` argument.
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
// Processes `b` one element at a time against `a`, updating a row of 32-bit
// values held in `prev_vec` / `curr_vec`, and returns `prev_vec[A_SIZE]` after
// the last row.
// `prev_vec`, `curr_vec`, `A_SIZE`, `B_SIZE`, `VEC_SIZE` and `WIDTH` are not
// declared in this class — presumably they come from `levenstein_base`.
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
#ifdef DEBUG_MODE
// With DEBUG_MODE defined, the result comes from `comp` and the vector code
// below is unreachable. `comp` is defined elsewhere (presumably a reference
// implementation — TODO confirm).
return comp(A_SIZE, B_SIZE, a, b);
#endif
// All lanes equal to 1; added to the previous row below.
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
// Broadcast the current element of `b` to all eight lanes.
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
// Aligned load: `prev_vec + WIDTH * j` must be 32-byte aligned.
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
// Candidate 1: previous-row value in the same column, plus one.
__m256i curr = _mm256_add_epi32(prev, ones);
// Build the previous row moved up by one element. `_mm256_slli_si256` shifts
// each 128-bit half independently, so lanes 0 and 4 come out as zero and are
// patched by the two inserts that follow.
__m256i prev_shr = _mm256_slli_si256(prev, 4);
// Lane 0: last element of the preceding chunk, or `i + 1` for the first chunk.
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
// Lane 4: element 3 of `prev`, which the per-half shift dropped.
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
// Unaligned load of eight elements of `a`, starting one element before this chunk.
// NOTE(review): for j == 0 the address is `a - 1`, i.e. before the start of
// `a` — confirm the caller pads the buffer, otherwise this reads out of bounds.
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
// All-ones in lanes where the element of `a` equals `b[i]`, zero elsewhere.
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
// NOTE(review): the AND zeroes the lanes that do NOT match, and the signed min
// below then picks 0 for them whenever `curr` is non-negative — confirm
// whether a blend on the mask was intended instead.
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
// Aligned store of the chunk into the current row.
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
// NOTE(review): as written this copies `curr_vec[0]` into every element
// 1..A_SIZE, overwriting the values stored above — presumably a left-to-right
// `min(curr_vec[j], curr_vec[j - 1] + 1)` pass was intended; confirm.
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
// The row just written becomes the previous row for the next element of `b`.
std::swap(prev_vec, curr_vec);
}
return prev_vec[A_SIZE];
}
};
#else
/*
template <>
class levenstein<policy_avx> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};*/
#endif
/*
#if USE_AVX512
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
#endif*/
} // namespace levensol
#endif
#endif
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment