Commit afbc226a authored by Dubský Jan's avatar Dubský Jan
Browse files

Solution 2.07.1 (Speed: 1.06)

parent 15c7f11e
......@@ -106,40 +106,6 @@ public:
}
};
//#define DEBUG_MODE
#ifdef DEBUG_MODE
inline uint32_t comp(size_t a_size, size_t b_size, const uint32_t* a, const uint32_t* b) {
levenstein<policy_scalar> l(a_size, b_size);
return l.compute(a, b);
}
/// Debug helper: prints the first `len` elements of `a` and `b` on one line,
/// prefixed with "DIFF:".
///
/// Positions where both arrays hold the same value print that value once.
/// Positions that differ print `a[i]:b[i]` wrapped in the ANSI SGR sequences
/// `ESC[33m` ... `ESC[00m` so they stand out on a terminal.
///
/// @param len number of elements read from each array
/// @param a   first array; elements `a[0] .. a[len-1]` are read
/// @param b   second array; elements `b[0] .. b[len-1]` are read
inline void prt_diff(size_t len, const uint32_t* a, const uint32_t* b) {
    std::cout << "DIFF:";
    for (size_t i = 0; i < len; ++i) {
        if (a[i] == b[i]) {
            std::cout << " " << a[i];
        } else {
            // "\033" is the standard octal spelling of ESC; "\e" is only a
            // compiler extension and is not accepted by every toolchain.
            std::cout << " \033[33m" << a[i] << ":" << b[i] << "\033[00m";
        }
    }
    std::cout << std::endl;  // std::endl also flushes the stream
}
/// Debug helper: prints an optional title followed by the elements of `array`,
/// each followed by a single space, then ends the line.
///
/// Note that `len + 1` elements are written, i.e. `array[0] .. array[len]`
/// inclusive.
///
/// @param len   index of the last element printed
/// @param array values to print; must hold at least `len + 1` elements
/// @param title optional label printed as "<title>: " before the values
inline void prt_arr(size_t len, const uint32_t* array, const char* title = nullptr) {
    if (title != nullptr) {
        std::cout << title << ": ";
    }
    const size_t total = len + 1;  // inclusive upper bound: array[len] is printed too
    for (size_t idx = 0; idx < total; ++idx) {
        std::cout << array[idx] << " ";
    }
    std::cout << std::endl;
}
template <typename U>
inline void prt_vec(U vec, const char* desc) {
constexpr size_t SZ = sizeof(U) / sizeof(uint32_t);
uint32_t prt_arr[SZ];
_mm_store_si128((__m128i*)(prt_arr), vec);
std::cout << desc << ": ";
return;
#endif
#define gen_min_loop_128(k) \
{ \
prev_val += 1; \
......@@ -149,123 +115,94 @@ inline void prt_vec(U vec, const char* desc) {
prev_val = curr_val; \
}
template <>
class levenstein<policy_sse> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
//std::uint32_t compute(const std::uint32_t* __restrict__ a, const std::uint32_t* __restrict__ b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
if (SWAP) std::swap(a, b);
//__m128i *prev_vec = (__m128i*)zeroth_row, *curr_vec = (__m128i*)buffer2;
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = next_prev_shr;
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * (j + 1) - 1)));
#if 1
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
#endif
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
#if 0
#if 0
uint32_t prev = curr_vec[0];
for (size_t j = 1; j < A_SIZE + 1; ++j) {
prev += 1;
uint32_t curr = curr_vec[j];
if (prev < curr) curr_vec[j] = prev;
else
prev = curr;
}
#else
uint32_t prev_val = curr_vec[0] - 1;
for (size_t j = 0; j < VEC_CNT; ++j) {
auto addr = (__m128i*)(curr_vec + j * WIDTH);
__m128i curr = _mm_load_si128(addr);
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128(addr, curr);
}
#endif
#endif
template <>
class levenstein<policy_sse> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
//if (!i) prev_vec = buffer1;
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
if (SWAP) std::swap(a, b);
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
curr_vec[-1] = i + 2;
uint32_t prev_val = i;
__m128i next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec - 1));
__m128i curra = _mm_loadu_si128((__m128i*)(a - 1));
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = next_prev_shr;
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
next_prev_shr = _mm_loadu_si128((__m128i*)(prev_vec + (WIDTH * (j + 1)) - 1));
curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * (j + 1) - 1)));
gen_min_loop_128(0);
gen_min_loop_128(1);
gen_min_loop_128(2);
gen_min_loop_128(3);
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
return prev_vec[A_SIZE];
std::swap(prev_vec, curr_vec);
if (!i) curr_vec = buffer1;
}
private:
const __m128i ones = _mm_set1_epi32(1);
};
return prev_vec[A_SIZE];
}
private:
const __m128i ones = _mm_set1_epi32(1);
};
// FIXME: remove the `&& 0` from the condition below (it currently disables the AVX specialization)
#if USE_AVX && 0
template <>
class levenstein<policy_avx> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
template <>
class levenstein<policy_avx> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
#ifdef DEBUG_MODE
return comp(A_SIZE, B_SIZE, a, b);
return comp(A_SIZE, B_SIZE, a, b);
#endif
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
__m256i curr = _mm256_add_epi32(prev, ones);
__m256i prev_shr = _mm256_slli_si256(prev, 4);
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
__m256i curr = _mm256_add_epi32(prev, ones);
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
__m256i prev_shr = _mm256_slli_si256(prev, 4);
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
std::swap(prev_vec, curr_vec);
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
return prev_vec[A_SIZE];
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
std::swap(prev_vec, curr_vec);
}
};
return prev_vec[A_SIZE];
}
};
#else
......@@ -280,12 +217,12 @@ public:
#if USE_AVX512
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein<policy_sse>(a_size, b_size){};
};
#endif
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment