Commit 5470f39d authored by Dubský Jan
Browse files

Solution 2.03 (Speed 8.3)

parent 5ebcaffc
......@@ -27,41 +27,40 @@ public:
// Base class that stores the two input sizes and owns the raw uint32_t work
// buffers (allocated with aligned_alloc, released with free).
// NOTE(review): this span is a rendered diff hunk without +/- markers, so lines
// that look like the pre-commit and post-commit versions are interleaved (two
// constructor headers, two sets of members, two width helpers). It is not
// compilable exactly as shown — confirm against the real file before editing.
class levenstein_base {
protected:
// Constructor variant taking `width` (presumably the pre-commit version).
// It derives ALIGNMENT, VEC_SIZE, VEC_CNT, MIN_DST and MAX_DST in the
// initializer list, then allocates zeroth_row with VEC_SIZE elements and
// fills it with 0, 1, 2, ... VEC_SIZE - 1.
levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
A_SIZE(a_size), B_SIZE(b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(a_size + 1)), VEC_CNT(VEC_SIZE / WIDTH), MIN_DST(std::abs((int64_t)a_size - (int64_t)b_size)), MAX_DST(std::min(a_size, b_size) + MIN_DST) {
zeroth_row = (uint32_t*)aligned_alloc(ALIGNMENT, VEC_SIZE * sizeof(uint32_t));
assert(zeroth_row);
for (size_t i = 0; i < VEC_SIZE; ++i) zeroth_row[i] = i;
// Constructor variant taking `word` (presumably the post-commit version).
// WIDTH comes from calc_width(A_SIZE) and HEIGHT is B_SIZE + 1.
levenstein_base(std::size_t a_size, std::size_t b_size, size_t word) :
A_SIZE(a_size), B_SIZE(b_size), WORD(word), WIDTH(calc_width(A_SIZE)), HEIGHT(B_SIZE + 1) {
// Byte size of one buffer: HEIGHT groups of WORD uint32_t values.
const size_t size = sizeof(uint32_t) * WORD * HEIGHT;
buffer1 = (uint32_t*)aligned_alloc(ALIGNMENT, VEC_SIZE * sizeof(uint32_t));
// NOTE(review): aligned_alloc expects the alignment in bytes. WORD is passed
// here, whereas the other variant passes sizeof(uint32_t) * WIDTH. These
// buffers are written with _mm_store_si128 elsewhere in this file, so
// confirm that the requested alignment is what was intended.
init_buff = (uint32_t*)aligned_alloc(WORD, size);
assert(init_buff);
// Stores i in the last slot of the i-th WORD-sized group (index
// (i + 1) * word - 1); the other slots of init_buff are left unwritten here.
for (size_t i = 0; i < HEIGHT; ++i) init_buff[(i + 1) * word - 1] = i;
buffer1 = (uint32_t*)aligned_alloc(WORD, size);
assert(buffer1);
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, VEC_SIZE * sizeof(uint32_t));
buffer2 = (uint32_t*)aligned_alloc(WORD, size);
assert(buffer2);
}
// Frees every buffer pointer. Both init_buff and zeroth_row appear because
// both versions of the hunk are shown.
~levenstein_base() {
free(init_buff);
free(buffer1);
free(buffer2);
free(zeroth_row);
}
protected:
// Sizes of the two inputs as passed to the constructor.
const size_t A_SIZE, B_SIZE;
// Value of the `word` constructor argument; used as the group stride when
// indexing the buffers.
const size_t WORD;
// Either the `width` argument or calc_width(A_SIZE), depending on the variant.
const size_t WIDTH;
// Members initialised only by the `width` constructor variant.
const size_t ALIGNMENT;
const size_t VEC_SIZE;
const size_t VEC_CNT;
// |a_size - b_size| and min(a_size, b_size) + MIN_DST respectively.
const size_t MIN_DST;
const size_t MAX_DST;
// B_SIZE + 1 (set by the `word` constructor variant).
const size_t HEIGHT;
// Raw buffers owned by this object and released in the destructor.
uint32_t* init_buff;
uint32_t* buffer1;
uint32_t* buffer2;
uint32_t* zeroth_row;
private:
// Two versions of the width helper: vec_size uses WIDTH, calc_width uses
// WORD. The body of vec_size is not closed in this rendering.
// NOTE(review): x / W + x % W is not the usual round-up-to-multiple formula
// ((x + W - 1) / W) — confirm this is the intended result.
size_t vec_size(size_t element_cnt) {
return WIDTH * (element_cnt / WIDTH + element_cnt % WIDTH);
size_t calc_width(size_t element_cnt) {
return WORD * ((element_cnt / WORD) + (element_cnt % WORD));
}
};
......@@ -71,6 +70,7 @@ struct policy_avx512 {};
//#define DEBUG_MODE
#if 0
// Empty tag type; commented-out code in this file names it as the template
// argument in levenstein<policy_scalar>.
struct policy_scalar {};
template <>
......@@ -122,7 +122,8 @@ public:
return first;
}
#endif
};
}; // namespace levensol
#endif
#ifdef DEBUG_MODE
......@@ -170,8 +171,46 @@ public:
//levenstein<policy_scalar> lev_scal(A_SIZE, B_SIZE);
//return lev_scal.compute(a, b);
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
uint32_t *prev_vec = init_buff, *curr_vec = buffer2;
for (size_t j = 0; j < WIDTH; ++j) {
const size_t base = j * WORD;
__m128i prev = _mm_setr_epi32(base + 1, base + 2, base + 3, base + 4);
__m128i curra = _mm_loadu_si128((__m128i*)(a + (WORD * j)));
for (size_t i = 1; i < HEIGHT; ++i) {
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = _mm_slli_si128(prev, 4);
prev_shr = _mm_insert_epi32(prev_shr, prev_vec[(i + 1) * WORD - 1], 0);
__m128i currb = _mm_set1_epi32(b[i]);
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
_mm_store_si128((__m128i*)(curr_vec + (WORD * i)), curr);
uint32_t prev_val = prev_vec[(i + 1) * WORD - 1];
for (size_t k = 0; k < WORD; ++k) {
prev_val += 1;
size_t c_index = (WORD * i) + k;
uint32_t curr_val = curr_vec[c_index];
if (prev_val < curr_val) {
curr_vec[c_index] = prev_val;
} else
prev_val = curr_val;
}
}
std::swap(prev_vec, curr_vec);
if (!j) curr_vec = buffer1;
}
return prev_vec[(A_SIZE - 1) % WORD];
#if 0
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_CNT; ++j) {
......@@ -210,6 +249,7 @@ public:
}
return prev_vec[A_SIZE];
#endif
}
private:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment