Commit 8fc0cd6a authored by Dubský Jan

Solution 2.04 (Speed: 1.44)

parent 59393cea
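
The code touched below computes the Levenshtein edit distance row by row, keeping only the previous and the current DP row. As a point of reference, this is the scalar recurrence that the levenstein<policy_*> specializations in this file vectorize; the sketch is illustrative only and not part of the commit:

#include <algorithm>
#include <cstdint>
#include <vector>

// Reference scalar implementation of the row-by-row recurrence that the
// levenstein<policy_*> classes vectorize. Names are illustrative only.
inline uint32_t levenshtein_reference(const std::vector<uint32_t>& a,
                                      const std::vector<uint32_t>& b) {
    std::vector<uint32_t> prev(a.size() + 1), curr(a.size() + 1);
    for (std::size_t j = 0; j <= a.size(); ++j) prev[j] = j;  // zeroth row: 0, 1, 2, ...
    for (std::size_t i = 0; i < b.size(); ++i) {
        curr[0] = i + 1;
        for (std::size_t j = 1; j <= a.size(); ++j) {
            uint32_t del = prev[j] + 1;                       // cell above
            uint32_t ins = curr[j - 1] + 1;                   // cell to the left
            uint32_t sub = prev[j - 1] + (a[j - 1] != b[i]);  // diagonal, +1 on mismatch
            curr[j] = std::min({del, ins, sub});
        }
        std::swap(prev, curr);
    }
    return prev[a.size()];
}
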
@@ -28,7 +28,7 @@ public:
class levenstein_base {
protected:
levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
A_SIZE(a_size), B_SIZE(b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(a_size + 1)), VEC_CNT(VEC_SIZE / WIDTH), MIN_DST(std::abs((int64_t)a_size - (int64_t)b_size)), MAX_DST(std::min(a_size, b_size) + MIN_DST) {
A_SIZE(a_size), B_SIZE(b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(a_size + 1)), VEC_CNT(VEC_SIZE / WIDTH) {
zeroth_row = (uint32_t*)aligned_alloc(ALIGNMENT, VEC_SIZE * sizeof(uint32_t));
assert(zeroth_row);
for (size_t i = 0; i < VEC_SIZE; ++i) zeroth_row[i] = i;
@@ -52,8 +52,6 @@ protected:
const size_t ALIGNMENT;
const size_t VEC_SIZE;
const size_t VEC_CNT;
const size_t MIN_DST;
const size_t MAX_DST;
uint32_t* buffer1;
uint32_t* buffer2;
@@ -69,8 +67,6 @@ struct policy_sse {};
struct policy_avx {};
struct policy_avx512 {};
//#define DEBUG_MODE
struct policy_scalar {};
template <>
@@ -80,10 +76,10 @@ public:
levenstein_base(a_size, b_size, 1) {
}
// Used for debugging
const uint32_t* compute_next_row(size_t row, const std::uint32_t* a, const std::uint32_t* b) {
buffer2[0] = row + 1;
for (size_t j = std::max(0ul, row - MAX_DST) + 1; j < std::min(A_SIZE, row + MAX_DST) + 1; ++j) {
//for (size_t j = 1; j < A_SIZE + 1; ++j) {
for (size_t j = 1; j < A_SIZE + 1; ++j) {
buffer2[j] = std::min(std::min(buffer1[j], buffer2[j - 1]) + 1, buffer1[j - 1] + (a[j - 1] != b[row]));
}
std::swap(buffer1, buffer2);
@@ -97,35 +93,11 @@ public:
}
return buffer1[A_SIZE];
}
#if 0
std::uint32_t compute2(const std::uint32_t* a, const std::uint32_t* b) {
for (size_t i = 1; i < MAT_H; ++i) {
mat(i, 0) = i;
for (size_t j = 1; j < MAT_W; ++j) {
mat(i, j) = std::min(mat(i - 1, j) + 1, mat(i - 1, j - 1) + (a[j - 1] != b[i]));
}
}
for (size_t j = 1; j < MAT_W; ++j) {
for (size_t i = 1; i < MAT_H; ++i) {
mat(i, j) = std::min(mat(i, j), mat(i, j - 1) + 1);
}
}
return mat(B_SIZE, A_SIZE);
}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
uint32_t first = compute1(a, b);
uint32_t second = compute2(a, b);
std::cout << first << " : " << second << std::endl;
assert(first == second);
return first;
}
#endif
};
#ifdef DEBUG_MODE
//#define DEBUG_MODE
#ifdef DEBUG_MODE
inline uint32_t comp(size_t a_size, size_t b_size, const uint32_t* a, const uint32_t* b) {
levenstein<policy_scalar> l(a_size, b_size);
return l.compute(a, b);
@@ -159,6 +131,15 @@ inline void prt_vec(U vec, const char* desc) {
#endif
#define gen_min_loop_128(k) \
{ \
prev += 1; \
uint32_t curr = _mm_extract_epi32(tmp, k); \
if (prev < curr) tmp = _mm_insert_epi32(tmp, prev, k); \
else \
prev = curr; \
}
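
In plain terms, applying gen_min_loop_128 for k = 0..3 performs a running prefix minimum across the four lanes of a vector: it folds in the "left neighbour + 1" (insertion) term, which the purely vertical vector operations in compute() below cannot produce. A scalar model of the same step, for readability only (the real code keeps the data in an __m128i and uses extract/insert):

#include <cstdint>

// Scalar model of one gen_min_loop_128(0)..(3) sequence over a 4-lane vector.
// prev carries the last fixed-up cell from the preceding vector of the row.
inline void prefix_min_lanes(uint32_t lane[4], uint32_t& prev) {
    for (int k = 0; k < 4; ++k) {
        prev += 1;                           // cost of one insertion from the left
        if (prev < lane[k]) lane[k] = prev;  // left neighbour + 1 is smaller
        else prev = lane[k];                 // keep the cell, continue from it
    }
}
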
template <>
class levenstein<policy_sse> : levenstein_base {
public:
@@ -166,10 +147,6 @@ public:
levenstein_base(a_size, b_size, 4) {}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
// TODO: Fix this shit
//levenstein<policy_scalar> lev_scal(A_SIZE, B_SIZE);
//return lev_scal.compute(a, b);
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
@@ -178,8 +155,12 @@ public:
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = _mm_slli_si128(prev, 4);
prev_shr = _mm_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
__m128i prev_shr;
if (j) prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * j) - 1));
else {
prev_shr = _mm_slli_si128(prev, 4);
prev_shr = _mm_insert_epi32(prev_shr, (i + 1), 0);
}
__m128i curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * j - 1)));
__m128i mask = _mm_cmpeq_epi32(curra, currb);
@@ -189,24 +170,33 @@ public:
curr = _mm_min_epu32(curr, prev_shr);
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
/*for (size_t k = j ? 0 : 1; k < WIDTH; ++k) {
uint32_t p = curr_vec[(WIDTH * j) + k - 1], c = curr_vec[(WIDTH * j) + k];
if (p + 1 < c) curr_vec[(WIDTH * j) + k] = p + 1;
}*/
}
#if 0
uint32_t prev = curr_vec[0];
for (size_t j = 1; j < A_SIZE + 1; ++j) {
prev += 1;
uint32_t curr = curr_vec[j];
if (prev < curr) {
curr_vec[j] = prev;
} else
if (prev < curr) curr_vec[j] = prev;
else
prev = curr;
}
#else
uint32_t prev = curr_vec[0] - 1;
for (size_t j = 0; j < VEC_CNT; ++j) {
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i tmp = _mm_load_si128(addr);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128(addr, tmp);
}
#endif
if (!i) prev_vec = buffer1;
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
}
return prev_vec[A_SIZE];
@@ -216,8 +206,8 @@ private:
const __m128i ones = _mm_set1_epi32(1);
};
// FIXME: Remove 1 in condition
#if USE_AVX && 0 //|| 1
// FIXME: Remove 0 in condition
#if USE_AVX && 0
template <>
class levenstein<policy_avx> : levenstein_base {
......
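
For completeness, the DEBUG_MODE comp() helper above shows the intended call pattern for these classes; driving the SSE specialization looks the same. A usage sketch (the wrapper function and its input handling are made up for the example; levenstein<policy_sse> comes from the file in this diff):

#include <cstdint>
#include <vector>

// Hypothetical driver around the classes defined above.
uint32_t distance_sse(const std::vector<uint32_t>& a, const std::vector<uint32_t>& b) {
    levenstein<policy_sse> l(a.size(), b.size());  // allocates the aligned row buffers
    return l.compute(a.data(), b.data());          // row-by-row SSE computation
}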