Commit f2d2fbaf authored by Dubský Jan
Browse files

Solution 4.05 (Speed: 1.4)

parent c365be23
......@@ -74,7 +74,7 @@ private:
break;
#define gen_asgn(pos, wpos) \
writes##pos = _mm_insert_epi16(writes##pos, _mm_extract_epi16(row, pos), wpos)
writes[pos] = _mm_insert_epi16(writes[pos], _mm_extract_epi16(row, pos), wpos)
#define gen_for(wpos) \
row = _mm_load_si128((__m128i*)&b.addr(i * VEC_CNT + wpos, j_block * VEC_CNT)); \
......@@ -103,7 +103,7 @@ public:
const size_t B_COPY_SIZE = (a.HS + VEC_CNT - 1) / VEC_CNT * VEC_CNT;
matrix_element b_copy[VEC_CNT][B_COPY_SIZE];
__m128i writes0, writes1, writes2, writes3, writes4, writes5, writes6, writes7;
__m128i writes[VEC_CNT];
for (size_t i = 0; i < B_COPY_SIZE / VEC_CNT; ++i) {
__m128i row;
gen_for(0);
......@@ -114,15 +114,9 @@ public:
gen_for(5);
gen_for(6);
gen_for(7);
_mm_store_si128((__m128i*)&b_copy[0][i * VEC_CNT], writes0);
_mm_store_si128((__m128i*)&b_copy[1][i * VEC_CNT], writes1);
_mm_store_si128((__m128i*)&b_copy[2][i * VEC_CNT], writes2);
_mm_store_si128((__m128i*)&b_copy[3][i * VEC_CNT], writes3);
_mm_store_si128((__m128i*)&b_copy[4][i * VEC_CNT], writes4);
_mm_store_si128((__m128i*)&b_copy[5][i * VEC_CNT], writes5);
_mm_store_si128((__m128i*)&b_copy[6][i * VEC_CNT], writes6);
_mm_store_si128((__m128i*)&b_copy[7][i * VEC_CNT], writes7);
for (size_t k = 0; k < VEC_CNT; ++k) {
_mm_store_si128((__m128i*)&b_copy[k][i * VEC_CNT], writes[k]);
}
}
for (size_t j = 0; j < VEC_CNT; ++j) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment