Commit 15c7f11e authored by Dubský Jan
Browse files

Solution 2.07 (Speed: 1.06)

parent 46e69145
......@@ -28,7 +28,7 @@ public:
class levenstein_base {
protected:
levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
A_SIZE(a_size), B_SIZE(b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(a_size + 1)), VEC_CNT(VEC_SIZE / WIDTH) {
SWAP(a_size > b_size), A_SIZE(SWAP ? b_size : a_size), B_SIZE(SWAP ? a_size : b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH), VEC_SIZE(vec_size(A_SIZE + 1)), VEC_CNT(VEC_SIZE / WIDTH) {
const size_t size = VEC_SIZE * sizeof(uint32_t) + CACHE_WIDTH;
zeroth_row = (uint32_t*)aligned_alloc(ALIGNMENT, size);
......@@ -41,23 +41,23 @@ protected:
assert(buffer1);
buffer1 += CACHE_PAD;
if (0) buffer2 = buffer1;
else {
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer2);
buffer2 += CACHE_PAD;
}
buffer2 = (uint32_t*)aligned_alloc(ALIGNMENT, size);
assert(buffer2);
buffer2 += CACHE_PAD;
}
~levenstein_base() {
return;
free(buffer1 - CACHE_PAD);
if (buffer1 != buffer2) free(buffer2 - CACHE_PAD);
free(buffer2 - CACHE_PAD);
free(zeroth_row - CACHE_PAD);
}
protected:
static const size_t CACHE_WIDTH = 64;
static const size_t CACHE_PAD = CACHE_WIDTH / sizeof(uint32_t);
const bool SWAP;
const size_t A_SIZE, B_SIZE;
const size_t WIDTH;
const size_t ALIGNMENT;
......@@ -136,9 +136,7 @@ inline void prt_vec(U vec, const char* desc) {
uint32_t prt_arr[SZ];
_mm_store_si128((__m128i*)(prt_arr), vec);
std::cout << desc << ": ";
for (size_t k = 0; k < SZ; ++k) std::cout << prt_arr[k] << ' ';
std::cout << std::endl;
}
return;
#endif
......@@ -151,43 +149,45 @@ inline void prt_vec(U vec, const char* desc) {
prev_val = curr_val; \
}
template <>
class levenstein<policy_sse> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
//std::uint32_t compute(const std::uint32_t* __restrict__ a, const std::uint32_t* __restrict__ b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
//__m128i prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * j) - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * j - 1)));
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
template <>
class levenstein<policy_sse> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
//std::uint32_t compute(const std::uint32_t* __restrict__ a, const std::uint32_t* __restrict__ b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
if (SWAP) std::swap(a, b);
//__m128i *prev_vec = (__m128i*)zeroth_row, *curr_vec = (__m128i*)buffer2;
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = next_prev_shr;
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * (j + 1) - 1)));
#if 1
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
#endif
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
#if 0
#if 0
......@@ -200,72 +200,72 @@ public:
prev = curr;
}
#else
uint32_t prev_val = curr_vec[0] - 1;
for (size_t j = 0; j < VEC_CNT; ++j) {
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i curr = _mm_load_si128(addr);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128(addr, curr);
}
uint32_t prev_val = curr_vec[0] - 1;
for (size_t j = 0; j < VEC_CNT; ++j) {
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i curr = _mm_load_si128(addr);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128(addr, curr);
}
#endif
#endif
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
//if (!i) prev_vec = buffer1;
}
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
//if (!i) prev_vec = buffer1;
}
return prev_vec[A_SIZE];
}
return prev_vec[A_SIZE];
}
private:
const __m128i ones = _mm_set1_epi32(1);
};
private:
const __m128i ones = _mm_set1_epi32(1);
};
// FIXME: Remove 0 in condition
#if USE_AVX && 0
template <>
class levenstein<policy_avx> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
template <>
class levenstein<policy_avx> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
#ifdef DEBUG_MODE
return comp(A_SIZE, B_SIZE, a, b);
return comp(A_SIZE, B_SIZE, a, b);
#endif
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
__m256i curr = _mm256_add_epi32(prev, ones);
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
__m256i curr = _mm256_add_epi32(prev, ones);
__m256i prev_shr = _mm256_slli_si256(prev, 4);
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
__m256i prev_shr = _mm256_slli_si256(prev, 4);
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
std::swap(prev_vec, curr_vec);
}
std::swap(prev_vec, curr_vec);
}
return prev_vec[A_SIZE];
}
};
return prev_vec[A_SIZE];
}
};
#else
......@@ -280,12 +280,12 @@ public:
#if USE_AVX512
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
public:
    /// AVX-512 policy: currently just forwards to the SSE implementation
    /// (no dedicated AVX-512 kernel exists yet), passing both string
    /// dimensions through unchanged.
    levenstein(std::size_t a_size, std::size_t b_size) :
        levenstein<policy_sse>(a_size, b_size) {}
};
// AVX-512 specialization is a thin alias over the SSE implementation:
// the constructor only forwards the two size arguments to the base.
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
#endif
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment