Commit f307e683 authored by Dubský Jan
Browse files

Solution 2.01 (Speed: 3.12)

parent 77671c51
#include "levensol.hpp"
// Nothing in this namespace block is compiled at present: its only content
// sits inside `#if 0`.
namespace levensol {
#if 0
// Disabled placeholder for an out-of-class definition of
// levenstein<policy_avx>::compute. The body holds only a commented-out loop
// and has no return statement, so it needs a real implementation before the
// `#if 0` can be lifted.
template <>
std::uint32_t levenstein<policy_avx>::compute(const std::uint32_t* a, const std::uint32_t* b) {
//for (size_t i = 0; i < a_size + 1; ++i) prev.push_back(i);
}
#endif
} // namespace levensol
#ifndef levensol_hpp_
#define levensol_hpp_
#include <immintrin.h>
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <new>
#include <vector>
namespace levensol {
template< typename policy>
class levenstein {
public:
levenstein(std::size_t a_size, std::size_t b_size)
{}
/// Primary template of the distance calculator, selected by a policy tag.
///
/// This generic version only offers a constructor and a destructor, both of
/// which do nothing. Its compute() is commented out in the original source;
/// the explicit specializations in this file are the ones that supply it.
template <typename policy>
class levenstein {
public:
    /// Accepts the lengths of the two sequences and ignores both.
    levenstein(std::size_t /*a_size*/, std::size_t /*b_size*/) {}

    /// User-provided on purpose (kept from the original); releases nothing.
    ~levenstein() {}
};
/// Storage shared by the policy-specific distance kernels.
///
/// Owns three rows of `VEC_SIZE` 32-bit cells each:
///  - `zeroth_row` holds 0, 1, 2, ... (the first row of the DP table),
///  - `buffer1` / `buffer2` are scratch rows whose contents are left
///    uninitialised here; derived classes fill and swap them.
///
/// `VEC_SIZE` is `a_size + 1` rounded up to a whole number of `WIDTH`-element
/// vectors, so every row can be processed in `VEC_CNT` full vectors.
///
/// The class is non-copyable because it owns raw allocations; an implicit
/// copy would free the same rows twice.
class levenstein_base {
protected:
    /// @param a_size  number of elements in the first sequence
    /// @param b_size  number of elements in the second sequence
    /// @param width   number of 32-bit lanes per vector (must be non-zero;
    ///                the specializations in this file pass 1, 4 or 8)
    /// @throws std::bad_alloc if any of the three rows cannot be allocated
    levenstein_base(std::size_t a_size, std::size_t b_size, size_t width) :
        A_SIZE(a_size), B_SIZE(b_size), WIDTH(width), ALIGNMENT(sizeof(uint32_t) * WIDTH),
        VEC_SIZE(vec_size(a_size + 1)), VEC_CNT(VEC_SIZE / WIDTH),
        buffer1(alloc_row()), buffer2(alloc_row()), zeroth_row(alloc_row()) {
        // Checked unconditionally: an assert would vanish under NDEBUG and the
        // initialisation loop below would then write through a null pointer.
        if (!buffer1 || !buffer2 || !zeroth_row) {
            release();  // the destructor does not run when a constructor throws
            throw std::bad_alloc();
        }
        for (size_t i = 0; i < VEC_SIZE; ++i) zeroth_row[i] = static_cast<uint32_t>(i);
    }

    ~levenstein_base() {
        release();
    }

    levenstein_base(const levenstein_base&) = delete;
    levenstein_base& operator=(const levenstein_base&) = delete;

protected:
    const size_t A_SIZE, B_SIZE;
    const size_t WIDTH;
    const size_t ALIGNMENT;
    const size_t VEC_SIZE;
    const size_t VEC_CNT;
    uint32_t* buffer1;
    uint32_t* buffer2;
    uint32_t* zeroth_row;

private:
    /// Rounds `element_cnt` up to the next multiple of WIDTH.
    /// Relies on WIDTH being declared (and therefore initialised) before VEC_SIZE.
    size_t vec_size(size_t element_cnt) const {
        return WIDTH * (element_cnt / WIDTH + (element_cnt % WIDTH != 0 ? 1 : 0));
    }

    /// Allocates one row aligned to at least ALIGNMENT bytes, or returns nullptr.
    uint32_t* alloc_row() const {
        // Some C libraries reject alignments below sizeof(void*) or sizes that
        // are not a multiple of the alignment, so both are padded up here.
        const size_t align = ALIGNMENT < sizeof(void*) ? sizeof(void*) : ALIGNMENT;
        const size_t bytes = VEC_SIZE * sizeof(uint32_t);
        const size_t padded = (bytes + align - 1) / align * align;
        return static_cast<uint32_t*>(aligned_alloc(align, padded));
    }

    /// Frees all three rows; free(nullptr) is a no-op, so partial state is fine.
    void release() noexcept {
        free(buffer1);
        free(buffer2);
        free(zeroth_row);
    }
};
// Empty tag types used as the template argument of levenstein<> to pick a kernel.
// policy_sse: selects the specialization built on __m128i (4 lanes per vector).
struct policy_sse {};
// policy_avx: its specialization currently inherits the SSE kernel, because
// the native 256-bit branch further down is compiled out.
struct policy_avx {};
// policy_avx512: a specialization exists only when USE_AVX512 is non-zero,
// and it also inherits the SSE kernel.
struct policy_avx512 {};
//#define DEBUG_MODE
#ifdef DEBUG_MODE
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b)
{
return 0;
// Tag type selecting the plain scalar specialization of levenstein<>; it is
// declared inside the DEBUG_MODE section only.
struct policy_scalar {};
/// Scalar reference implementation (declared inside the DEBUG_MODE section).
///
/// Keeps two rows of the DP table: `buffer1` is the most recently finished
/// row and `buffer2` is the row being written. It can be driven either one
/// row at a time through compute_next_row() (the SSE kernel's debug path
/// compares its own rows against these) or in one go through compute().
template <>
class levenstein<policy_scalar> : levenstein_base {
public:
    /// Uses a vector width of 1, so rows hold exactly a_size + 1 cells.
    levenstein(std::size_t a_size, std::size_t b_size) :
        levenstein_base(a_size, b_size, 1) {
        reset();
    }

    /// Computes row `row + 1` of the table from the previous row.
    ///
    /// @param row  zero-based index into `b` of the element being consumed
    /// @param a    first sequence, A_SIZE elements
    /// @param b    second sequence; b[row] must be readable
    /// @return pointer to the freshly computed row (A_SIZE + 1 valid cells);
    ///         it stays valid until the next call on this object
    const uint32_t* compute_next_row(size_t row, const std::uint32_t* a, const std::uint32_t* b) {
        buffer2[0] = static_cast<uint32_t>(row + 1);
        for (size_t j = 1; j < A_SIZE + 1; ++j) {
            // min(cell above, cell to the left) + 1, versus diagonal + mismatch cost
            buffer2[j] = std::min(std::min(buffer1[j], buffer2[j - 1]) + 1, buffer1[j - 1] + (a[j - 1] != b[row]));
        }
        std::swap(buffer1, buffer2);
        return buffer1;
    }

    /// Computes the edit distance between a[0..A_SIZE) and b[0..B_SIZE).
    ///
    /// Restarts from the zeroth row on every call, so the object can be
    /// reused for several pairs of sequences of the same sizes.
    std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
        reset();
        for (size_t i = 0; i < B_SIZE; ++i) {
            compute_next_row(i, a, b);
        }
        return buffer1[A_SIZE];
    }

private:
    /// Seeds `buffer1` with the zeroth row (0, 1, 2, ...).
    void reset() {
        for (size_t i = 0; i < VEC_SIZE; ++i) buffer1[i] = zeroth_row[i];
    }
};
// NOTE(review): policy_sse is already defined earlier in this header; this
// second definition only becomes visible when DEBUG_MODE is defined, where it
// looks like a redefinition left over from an older revision - confirm and drop.
struct policy_sse {
};
inline uint32_t comp(size_t a_size, size_t b_size, const uint32_t* a, const uint32_t* b) {
levenstein<policy_scalar> l(a_size, b_size);
return l.compute(a, b);
}
// NOTE(review): policy_avx is already defined earlier in this header; this
// second definition only becomes visible when DEBUG_MODE is defined, where it
// looks like a redefinition left over from an older revision - confirm and drop.
struct policy_avx {
};
/// Prints two arrays side by side on std::cout, highlighting differences.
///
/// Output is "DIFF:" followed by one entry per index: the plain value when
/// a[i] == b[i], otherwise "a[i]:b[i]" wrapped in the ANSI sequence for
/// yellow text. The line ends with std::endl (newline plus flush).
///
/// @param len  number of elements read from both `a` and `b`
inline void prt_diff(size_t len, const uint32_t* a, const uint32_t* b) {
    std::cout << "DIFF:";
    for (size_t i = 0; i < len; ++i) {
        if (a[i] == b[i]) {
            std::cout << " " << a[i];
        } else {
            // "\033" is the standard spelling of the ESC byte; "\e" is a
            // compiler extension that yields the same character.
            std::cout << " \033[33m" << a[i] << ":" << b[i] << "\033[00m";
        }
    }
    std::cout << std::endl;
}
// NOTE(review): policy_avx512 is already defined earlier in this header; this
// second definition only becomes visible when DEBUG_MODE is defined, where it
// looks like a redefinition left over from an older revision - confirm and drop.
struct policy_avx512 {
};
/// Dumps a row of values to std::cout, each followed by a single space.
///
/// Note the count: `len + 1` elements are printed (indices 0..len inclusive),
/// so `array` must have that many readable entries.
///
/// @param len    index of the last element to print
/// @param array  values to print
/// @param title  optional label, written as "<title>: " before the values
inline void prt_arr(size_t len, const uint32_t* array, const char* title = nullptr) {
    if (title != nullptr) {
        std::cout << title << ": ";
    }
    const uint32_t* cursor = array;
    for (size_t remaining = len + 1; remaining != 0; --remaining) {
        std::cout << *cursor++ << " ";
    }
    std::cout << std::endl;
}
/// Prints the 32-bit lanes of a vector-like value on std::cout.
///
/// Output is "<desc>: " followed by each lane and a space, then std::endl.
///
/// The lanes are extracted with memcpy, so this works for any trivially
/// copyable `U` whose size is at least one uint32_t (e.g. __m128i, __m256i)
/// and places no alignment requirement on the temporary lane array.
///
/// @tparam U    value type to dump; sizeof(U) / 4 lanes are printed
/// @param vec   value whose lanes are printed
/// @param desc  label written before the lanes
template <typename U>
inline void prt_vec(U vec, const char* desc) {
    constexpr size_t SZ = sizeof(U) / sizeof(uint32_t);
    static_assert(SZ > 0, "prt_vec needs a type holding at least one 32-bit lane");
    uint32_t lanes[SZ];
    std::memcpy(lanes, &vec, sizeof(lanes));
    std::cout << desc << ": ";
    for (size_t k = 0; k < SZ; ++k) std::cout << lanes[k] << ' ';
    std::cout << std::endl;
}
#endif
template <>
class levenstein<policy_sse> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 4) {}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
uint32_t *prev_vec = zeroth_row, *curr_vec = buffer2;
#ifdef DEBUG_MODE
levenstein<policy_scalar> lev_scal(A_SIZE, B_SIZE);
if (A_SIZE != 128) {
return 0;
}
std::cout << "--------------------------------------" << std::endl;
prt_arr(A_SIZE, a, "A");
prt_arr(B_SIZE, b, "B");
prt_arr(A_SIZE, prev_vec, "INIT_PREV");
#endif
for (size_t i = 0; i < B_SIZE; ++i) {
__m128i currb = _mm_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_CNT; ++j) {
__m128i prev = _mm_load_si128((__m128i*)(prev_vec + (WIDTH * j)));
__m128i curr = _mm_add_epi32(prev, ones);
__m128i prev_shr = _mm_slli_si128(prev, 4);
prev_shr = _mm_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
__m128i curra = _mm_loadu_si128((__m128i*)(a + (WIDTH * j - 1)));
__m128i mask = _mm_cmpeq_epi32(curra, currb);
__m128i ones_masked = _mm_and_si128(ones, ~mask);
prev_shr = _mm_add_epi32(prev_shr, ones_masked);
curr = _mm_min_epu32(curr, prev_shr);
_mm_store_si128((__m128i*)(curr_vec + (WIDTH * j)), curr);
}
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = std::min(curr_vec[j], curr_vec[j - 1] + 1);
if (!i) prev_vec = buffer1;
std::swap(prev_vec, curr_vec);
#ifdef DEBUG_MODE
auto correct = lev_scal.compute_next_row(i, a, b);
prt_diff(A_SIZE + 1, prev_vec, correct);
#endif
}
#ifdef DEBUG_MODE
for (size_t k = 0; k < A_SIZE + 1; ++k) std::cout << prev_vec[k] << " ";
std::cout << std::endl;
#endif
return prev_vec[A_SIZE];
}
private:
const __m128i ones = _mm_set1_epi32(1);
};
// Native 256-bit specialization (8 lanes per vector). It is compiled out:
// the `&& 0` below forces this branch off whatever USE_AVX is, so the
// fallback in the #else branch is what gets built.
// FIXME: Remove 1 in condition
#if USE_AVX && 0 //|| 1
// NOTE(review): this class is unfinished and would not compile if enabled -
// compute() uses prev_vec and curr_vec, which are declared neither here nor
// in levenstein_base.
template <>
class levenstein<policy_avx> : levenstein_base {
public:
levenstein(std::size_t a_size, std::size_t b_size) :
levenstein_base(a_size, b_size, 8) {}
std::uint32_t compute(const std::uint32_t* a, const std::uint32_t* b) {
#ifdef DEBUG_MODE
// In debug builds the scalar helper is used and the code below is skipped.
return comp(A_SIZE, B_SIZE, a, b);
#endif
const __m256i ones = _mm256_set1_epi32(1);
for (size_t i = 0; i < B_SIZE; ++i) {
__m256i currb = _mm256_set1_epi32(b[i]);
for (size_t j = 0; j < VEC_SIZE / WIDTH; ++j) {
__m256i prev = _mm256_load_si256((__m256i*)(prev_vec + (WIDTH * j)));
__m256i curr = _mm256_add_epi32(prev, ones);
// NOTE(review): _mm256_slli_si256 presumably shifts each 128-bit half on its
// own, which would explain why lane 4 is patched from lane 3 two lines
// below - confirm against the intrinsics reference.
__m256i prev_shr = _mm256_slli_si256(prev, 4);
prev_shr = _mm256_insert_epi32(prev_shr, j ? prev_vec[WIDTH * j - 1] : (i + 1), 0);
prev_shr = _mm256_insert_epi32(prev_shr, _mm256_extract_epi32(prev, 3), 4);
// For j == 0 the offset WIDTH * j - 1 addresses the element before a[0].
__m256i curra = _mm256_loadu_si256((__m256i*)(a + (WIDTH * j - 1)));
__m256i mask = _mm256_cmpeq_epi32(curra, currb);
// Unlike the SSE kernel, which adds a mismatch cost of 1, this ANDs the
// shifted previous row with the equality mask.
prev_shr = _mm256_and_si256(prev_shr, mask);
curr = _mm256_min_epi32(curr, prev_shr);
_mm256_store_si256((__m256i*)(curr_vec + (WIDTH * j)), curr);
}
// Unlike the SSE kernel, which takes min(cell, left + 1), this overwrites
// every cell with its left neighbour.
for (size_t j = 1; j < A_SIZE + 1; ++j) curr_vec[j] = curr_vec[j - 1];
std::swap(prev_vec, curr_vec);
}
return prev_vec[A_SIZE];
}
};
#else
/// Fallback for the AVX policy: adds nothing of its own and simply reuses
/// the SSE kernel, including its compute().
template <>
class levenstein<policy_avx> : public levenstein<policy_sse> {
    using sse_kernel = levenstein<policy_sse>;

public:
    /// Forwards both sequence lengths to the SSE kernel.
    levenstein(std::size_t a_size, std::size_t b_size)
        : sse_kernel(a_size, b_size) {}
};
#endif
#if USE_AVX512
/// AVX-512 policy (only present when USE_AVX512 is non-zero): adds nothing of
/// its own and simply reuses the SSE kernel, including its compute().
template <>
class levenstein<policy_avx512> : public levenstein<policy_sse> {
    using sse_kernel = levenstein<policy_sse>;

public:
    /// Forwards both sequence lengths to the SSE kernel.
    levenstein(std::size_t a_size, std::size_t b_size)
        : sse_kernel(a_size, b_size) {}
};
#endif
} // namespace levensol
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment