Commit 4441a98d authored by Dubský Jan
Browse files

Solution 2.08 (Speed: 0.96)

parent afbc226a
......@@ -12,23 +12,17 @@
namespace levensol {
// Primary `levenstein` class template, parameterised by a policy type.
// As written here it is an empty shell: the constructor and destructor have
// empty bodies, no data members are declared, and `compute` is commented out.
template <typename policy>
class levenstein {
public:
// Takes the two sizes but does not use them (empty body).
levenstein(std::size_t a_size, std::size_t b_size) {
}
// Empty destructor; this class declares no members to release.
~levenstein() {
}
// Empty policy tag types.
// NOTE(review): the tags are declared inside the class here, while other code
// in this file names them unqualified (e.g. `levenstein<policy_sse>`) —
// confirm the intended scope.
struct policy_sse {};
struct policy_avx {};
struct policy_avx512 {};
// Disabled stub; presumably the entry point that would compute the distance
// between sequences `a` and `b` — TODO confirm.
//std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
//}
};
template <typename policy>
class levenstein;
class levenstein_base {
protected:
levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
SWAP(a_size > b_size), A_SIZE(SWAP ? b_size : a_size), B_SIZE(SWAP ? a_size : b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(A_SIZE + 1)), VEC_CNT(VEC_SIZE / WIDTH) {
SWAP(a_size > b_size), A_SIZE(SWAP ? b_size : a_size), B_SIZE(SWAP ? a_size : b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(A_SIZE + 1)), VEC_CNT(VEC_SIZE / WIDTH), RANGE(A_SIZE / 2 / WIDTH) {
const size_t size = VEC_SIZE * sizeof(uint32_t) + CACHE_WIDTH;
zeroth_row = (uint32_t*)aligned_alloc(ALIGNMENT, size);
......@@ -44,6 +38,8 @@ protected:
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer2);
buffer2 += CACHE_PAD;
for (size_t i = 0; i < VEC_SIZE; ++i) buffer1[i] = buffer2[i] = -1;
}
~levenstein_base() {
......@@ -63,6 +59,7 @@ protected:
const size_t ALIGNMENT;
const size_t VEC_SIZE;
const size_t VEC_CNT;
const size_t RANGE;
uint32_t* buffer1;
uint32_t* buffer2;
......@@ -74,10 +71,7 @@ private:
}
};
// Empty tag types used as the `policy` template argument of `levenstein`
// (e.g. `levenstein<policy_sse>`). They carry no data or behaviour of their own.
// NOTE(review): the names suggest one tag per SIMD instruction set
// (SSE / AVX / AVX-512) — confirm against the specializations.
struct policy_sse {};
struct policy_avx {};
struct policy_avx512 {};
#if 0
struct policy_scalar {};
template <>
......@@ -105,6 +99,7 @@ public:
return buffer1[A_SIZE];
}
};
#endif
#define gen_min_loop_128(k) \
{ \
......@@ -126,13 +121,20 @@ public:
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
const __m128i currb = _mm_set1_epi32(b[i]);
const size_t lower = (size_t)std::max<int64_t>(0, ((int64_t)i - (int64_t)B_SIZE + (int64_t)(RANGE * WIDTH))) / WIDTH;
const size_t upper = std::min(VEC_SIZE, i + (RANGE * WIDTH)) / WIDTH;
const size_t offset = lower * WIDTH;
curr_vec[offset - 1] = i + 2 + offset;
uint32_t prev_val = i + offset;
__m128i next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + offset - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a + offset - 1));
for (size_t j = lower; j < upper; ++j) {
const size_t k = WIDTH * j;
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + k));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = next_prev_shr;
......@@ -141,15 +143,15 @@ public:
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * (j + 1) - 1)));
next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + k + WIDTH - 1));
curra = _mm_loadu_si128((__m128i*)(a + k + WIDTH - 1));
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
_mm_store_si128((__m128i*)(curr_vec + k), curr);
}
std::swap(prev_vec, curr_vec);
......@@ -163,6 +165,13 @@ private:
const __m128i ones = _mm_set1_epi32(1);
};
template <typename policy>
class levenstein : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
// FIXME: Remove 0 in condition
#if USE_AVX && 0
......@@ -205,16 +214,16 @@ public:
};
#else
/*
template <>
class levenstein<policy_avx> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
};*/
#endif
/*
#if USE_AVX512
template <>
......@@ -224,7 +233,7 @@ public:
levenstein<policy_sse>(a_size, b_size){};
};
#endif
#endif*/
} // namespace levensol
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment