Commit 2082e397 authored by Dubský Jan
Browse files

Solution 4.06 (Speed: 1.30)

parent 65c85403
......@@ -68,13 +68,8 @@ private:
}
};
// Emits one `case pos:` label for use inside a switch statement: copies the
// lowest 16-bit lane of `mn` into 16-bit lane `pos` of `res`, then breaks out
// of the switch.
//   res - __m128i lvalue that receives the value (it is both read and assigned)
//   mn  - __m128i whose lane 0 is extracted with _mm_extract_epi16
//   pos - used both as the case label and as the lane index passed to
//         _mm_insert_epi16, so it must be an integer constant expression
// The expansion already ends in `break;`, so writing `gen_switch_label(...);`
// just adds a harmless empty statement.
#define gen_switch_label(res, mn, pos) \
case pos: \
res = _mm_insert_epi16(res, _mm_extract_epi16(mn, 0), pos); \
break;
#define gen_asgn(pos, wpos) \
writes##pos = _mm_insert_epi16(writes##pos, _mm_extract_epi16(row, pos), wpos)
row##pos = _mm_insert_epi16(row##pos, _mm_extract_epi16(row, pos), wpos)
#define gen_for(wpos) \
row = _mm_load_si128((__m128i*)&b.addr(i * VEC_CNT + wpos, j_block * VEC_CNT)); \
......@@ -98,12 +93,14 @@ public:
assert(b.HS == HS);
assert(a.HS == b.VS);
const __m128i MASK = _mm_setr_epi16(0xffff, 0, 0, 0, 0, 0, 0, 0);
const size_t J_BLOCK_LIMIT = (HS + VEC_CNT - 1) / VEC_CNT;
for (size_t j_block = 0; j_block < J_BLOCK_LIMIT; ++j_block) {
const size_t B_COPY_SIZE = (a.HS + VEC_CNT - 1) / VEC_CNT * VEC_CNT;
matrix_element b_copy[VEC_CNT][B_COPY_SIZE];
__m128i writes0, writes1, writes2, writes3, writes4, writes5, writes6, writes7;
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
for (size_t i = 0; i < B_COPY_SIZE / VEC_CNT; ++i) {
__m128i row;
gen_for(0);
......@@ -115,34 +112,22 @@ public:
gen_for(6);
gen_for(7);
_mm_store_si128((__m128i*)&b_copy[0][i * VEC_CNT], writes0);
_mm_store_si128((__m128i*)&b_copy[1][i * VEC_CNT], writes1);
_mm_store_si128((__m128i*)&b_copy[2][i * VEC_CNT], writes2);
_mm_store_si128((__m128i*)&b_copy[3][i * VEC_CNT], writes3);
_mm_store_si128((__m128i*)&b_copy[4][i * VEC_CNT], writes4);
_mm_store_si128((__m128i*)&b_copy[5][i * VEC_CNT], writes5);
_mm_store_si128((__m128i*)&b_copy[6][i * VEC_CNT], writes6);
_mm_store_si128((__m128i*)&b_copy[7][i * VEC_CNT], writes7);
_mm_store_si128((__m128i*)&b_copy[0][i * VEC_CNT], row0);
_mm_store_si128((__m128i*)&b_copy[1][i * VEC_CNT], row1);
_mm_store_si128((__m128i*)&b_copy[2][i * VEC_CNT], row2);
_mm_store_si128((__m128i*)&b_copy[3][i * VEC_CNT], row3);
_mm_store_si128((__m128i*)&b_copy[4][i * VEC_CNT], row4);
_mm_store_si128((__m128i*)&b_copy[5][i * VEC_CNT], row5);
_mm_store_si128((__m128i*)&b_copy[6][i * VEC_CNT], row6);
_mm_store_si128((__m128i*)&b_copy[7][i * VEC_CNT], row7);
}
for (size_t j = 0; j < VEC_CNT; ++j) {
size_t j_abs = j + (VEC_CNT * j_block);
#if 0
// This is useless - the compiler already vectorizes the plain loop here
const size_t B_COPY_SIZE = (a.HS + VEC_CNT - 1) / VEC_CNT * VEC_CNT;
matrix_element b_copy[B_COPY_SIZE];
for (size_t i = 0; i < B_COPY_SIZE / VEC_CNT; ++i) {
const size_t k = i * VEC_CNT;
__m128i vec = _mm_setr_epi16(b.addr(k + 0, j), b.addr(k + 1, j), b.addr(k + 2, j), b.addr(k + 3, j), b.addr(k + 4, j), b.addr(k + 5, j), b.addr(k + 6, j), b.addr(k + 7, j));
_mm_storeu_si128((__m128i*)&b_copy[i * VEC_CNT], vec);
}
#else
//for (size_t i = 0; i < B_COPY_SIZE; ++i) {
// b_copy[j][i] = b.addr(i, j_abs);
//}
#endif
for (size_t i = 0; i < VS; ++i) {
//__m128i to_write = _mm_set1_epi16(0);
for (size_t j = 0; j < VEC_CNT; ++j) {
size_t j_abs = j + (VEC_CNT * j_block);
for (size_t i = 0; i < VS; ++i) {
const matrix_element res_init_val = std::numeric_limits<matrix_element>::max();
__m128i res_vec = _mm_set1_epi16(res_init_val);
......@@ -155,30 +140,35 @@ public:
}
__m128i mn = _mm_minpos_epu16(res_vec);
mn = _mm_and_si128(mn, MASK);
#if 0
const size_t iter = j % VEC_CNT;
switch (iter) {
gen_switch_label(write_vec, mn, 0);
gen_switch_label(write_vec, mn, 1);
gen_switch_label(write_vec, mn, 2);
gen_switch_label(write_vec, mn, 3);
gen_switch_label(write_vec, mn, 4);
gen_switch_label(write_vec, mn, 5);
gen_switch_label(write_vec, mn, 6);
gen_switch_label(write_vec, mn, 7);
default:
assert(false);
break;
}
if ((iter + 1) == VEC_CNT) {
_mm_store_si128((__m128i*)&addr(i, j - iter), write_vec);
}
// Emits one `case pos:` label for use inside a switch statement: shifts `mn`
// left by pos * 2 bytes (_mm_slli_si128 shifts by whole bytes), i.e. by `pos`
// 16-bit lanes, then breaks out of the switch. When only the lowest lane of
// `mn` is non-zero, this moves that value into lane `pos`.
//   pos - used as the case label and in the shift count, so it must be an
//         integer constant expression
// The expansion ends in `break` without a semicolon, so every use must be
// followed by `;`.
#define get_to_write_assign_label(pos) \
case pos: \
mn = _mm_slli_si128(mn, pos * 2); \
break
switch (j) {
get_to_write_assign_label(0);
get_to_write_assign_label(1);
get_to_write_assign_label(2);
get_to_write_assign_label(3);
get_to_write_assign_label(4);
get_to_write_assign_label(5);
get_to_write_assign_label(6);
get_to_write_assign_label(7);
default:
assert(false);
break;
}
to_write = _mm_or_si128(to_write, mn);
#endif
addr(i, j_abs) = _mm_extract_epi16(mn, 0);
}
//_mm_store_si128((__m128i*)&addr(i, VEC_CNT * j_block), to_write);
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment