Commit c365be23 authored by Dubský Jan
Browse files

Solution 4.05.1 (Speed: 1.39)

parent eddfbce7
......@@ -68,6 +68,25 @@ private:
}
};
// Expands to a single `case` arm for use inside a `switch`: copies 16-bit
// lane 0 of `src` into lane `lane` of `dst` and then leaves the switch.
// `lane` is used both as the case label and as the insert position, so it
// has to be an integer constant expression.
#define gen_switch_label(dst, src, lane)                                  \
    case lane: {                                                          \
        dst = _mm_insert_epi16(dst, _mm_extract_epi16(src, 0), lane);     \
    } break;
// Copies 16-bit lane `lane` of the variable `row` into lane `slot` of the
// accumulator named `writes<lane>` (the name is formed by token pasting).
// Both `row` and `writes<lane>` must be visible where the macro is expanded.
// Expands to a plain assignment expression; the caller supplies the `;`.
#define gen_asgn(lane, slot)                          \
    writes##lane = _mm_insert_epi16(                  \
        writes##lane, _mm_extract_epi16(row, lane), slot)
// Loads one 128-bit vector from `b` at (i * VEC_CNT + wpos, j_block * VEC_CNT)
// into `row` with _mm_load_si128, then writes 16-bit lane k of that vector
// into lane `wpos` of `writesk` for k = 0..7. Invoking it for wpos = 0..7
// therefore fills writes0..writes7 with the 8x8 tile of `b` transposed.
//
// Expects `b`, `i`, `j_block`, `VEC_CNT`, `row` and `writes0`..`writes7` to be
// in scope at the expansion site; `wpos` must be an integer constant because
// it ends up as the immediate operand of _mm_insert_epi16.
//
// The body is wrapped in do { ... } while (0) so that `gen_for(n);` is exactly
// one statement: it stays fully guarded under an unbraced `if`/`for`, composes
// with `else`, and no longer leaves a stray empty statement behind.
#define gen_for(wpos)                                                                          \
    do {                                                                                       \
        row = _mm_load_si128((__m128i*)&b.addr(i * VEC_CNT + (wpos), j_block * VEC_CNT));      \
        gen_asgn(0, wpos);                                                                     \
        gen_asgn(1, wpos);                                                                     \
        gen_asgn(2, wpos);                                                                     \
        gen_asgn(3, wpos);                                                                     \
        gen_asgn(4, wpos);                                                                     \
        gen_asgn(5, wpos);                                                                     \
        gen_asgn(6, wpos);                                                                     \
        gen_asgn(7, wpos);                                                                     \
    } while (0)
template <typename policy>
class matrix : public matrix_base {
public:
......@@ -79,7 +98,36 @@ public:
assert(b.HS == HS);
assert(a.HS == b.VS);
for (size_t j = 0; j < HS; ++j) {
const size_t J_BLOCK_LIMIT = (HS + VEC_CNT - 1) / VEC_CNT;
for (size_t j_block = 0; j_block < J_BLOCK_LIMIT; ++j_block) {
const size_t B_COPY_SIZE = (a.HS + VEC_CNT - 1) / VEC_CNT * VEC_CNT;
matrix_element b_copy[VEC_CNT][B_COPY_SIZE];
__m128i writes0, writes1, writes2, writes3, writes4, writes5, writes6, writes7;
for (size_t i = 0; i < B_COPY_SIZE / VEC_CNT; ++i) {
__m128i row;
gen_for(0);
gen_for(1);
gen_for(2);
gen_for(3);
gen_for(4);
gen_for(5);
gen_for(6);
gen_for(7);
_mm_store_si128((__m128i*)&b_copy[0][i * VEC_CNT], writes0);
_mm_store_si128((__m128i*)&b_copy[1][i * VEC_CNT], writes1);
_mm_store_si128((__m128i*)&b_copy[2][i * VEC_CNT], writes2);
_mm_store_si128((__m128i*)&b_copy[3][i * VEC_CNT], writes3);
_mm_store_si128((__m128i*)&b_copy[4][i * VEC_CNT], writes4);
_mm_store_si128((__m128i*)&b_copy[5][i * VEC_CNT], writes5);
_mm_store_si128((__m128i*)&b_copy[6][i * VEC_CNT], writes6);
_mm_store_si128((__m128i*)&b_copy[7][i * VEC_CNT], writes7);
}
for (size_t j = 0; j < VEC_CNT; ++j) {
size_t j_abs = j + (VEC_CNT * j_block);
#if 0
// This is useless - the compiler already vectorizes the normal loop here
const size_t B_COPY_SIZE = (a.HS + VEC_CNT - 1) / VEC_CNT * VEC_CNT;
matrix_element b_copy[B_COPY_SIZE];
......@@ -88,21 +136,49 @@ public:
__m128i vec = _mm_setr_epi16(b.addr(k + 0, j), b.addr(k + 1, j), b.addr(k + 2, j), b.addr(k + 3, j), b.addr(k + 4, j), b.addr(k + 5, j), b.addr(k + 6, j), b.addr(k + 7, j));
_mm_storeu_si128((__m128i*)&b_copy[i * VEC_CNT], vec);
}
#else
//for (size_t i = 0; i < B_COPY_SIZE; ++i) {
// b_copy[j][i] = b.addr(i, j_abs);
//}
#endif
for (size_t i = 0; i < VS; ++i) {
const matrix_element res_init_val = std::numeric_limits<matrix_element>::max();
__m128i res_vec = _mm_set1_epi16(res_init_val);
for (size_t i = 0; i < VS; ++i) {
const matrix_element res_init_val = std::numeric_limits<matrix_element>::max();
__m128i res_vec = _mm_set1_epi16(res_init_val);
size_t K_LIMIT = a.HS / VEC_CNT;
for (size_t k = 0; k < K_LIMIT; ++k) {
__m128i val_a = _mm_load_si128((__m128i*)&a.addr(i, k * VEC_CNT));
__m128i val_b = _mm_load_si128((__m128i*)&b_copy[j][k * VEC_CNT]);
__m128i val = _mm_add_epi16(val_a, val_b);
res_vec = _mm_min_epu16(res_vec, val);
}
__m128i mn = _mm_minpos_epu16(res_vec);
#if 0
const size_t iter = j % VEC_CNT;
switch (iter) {
gen_switch_label(write_vec, mn, 0);
gen_switch_label(write_vec, mn, 1);
gen_switch_label(write_vec, mn, 2);
gen_switch_label(write_vec, mn, 3);
gen_switch_label(write_vec, mn, 4);
gen_switch_label(write_vec, mn, 5);
gen_switch_label(write_vec, mn, 6);
gen_switch_label(write_vec, mn, 7);
default:
assert(false);
break;
}
size_t K_LIMIT = a.HS / VEC_CNT;
for (size_t k = 0; k < K_LIMIT; ++k) {
__m128i val_a = _mm_load_si128((__m128i*)&a.addr(i, k * VEC_CNT));
__m128i val_b = _mm_load_si128((__m128i*)&b_copy[k * VEC_CNT]);
__m128i val = _mm_add_epi16(val_a, val_b);
res_vec = _mm_min_epu16(res_vec, val);
if ((iter + 1) == VEC_CNT) {
_mm_store_si128((__m128i*)&addr(i, j - iter), write_vec);
}
#endif
__m128i mn = _mm_minpos_epu16(res_vec);
addr(i, j) = _mm_extract_epi16(mn, 0);
addr(i, j_abs) = _mm_extract_epi16(mn, 0);
}
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment