OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse2.cpp
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_sse2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include <climits>
39#include <cstdio>
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_params.h"
45#include "../codestream/ojph_params_local.h"
46
47#include "ojph_transform.h"
49
50#include <emmintrin.h>
51
52namespace ojph {
53 namespace local {
54
56 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
57 static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
58 {
59 // note that m must be obtained using
60 // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
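// The logical shift below loses the sign; m marks where the original sign
// bit lands after the shift. If that bit is 0, the xor and the subtraction
// cancel; if it is 1, the subtraction borrows and fills every bit above it
// with 1s, i.e. the sign extension of an arithmetic right shift.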
61 __m128i x = _mm_srli_epi64(a, amt);
62 x = _mm_xor_si128(x, m);
63 __m128i result = _mm_sub_epi64(x, m);
64 return result;
65 }
66
68 static inline
69 void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
70 {
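// width need not be a multiple of 8; the loop may run a few samples past
// the end, which is assumed safe because the line buffers carry padding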
71 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
72 {
73 __m128 a = _mm_load_ps(sp);
74 __m128 b = _mm_load_ps(sp + 4);
75 __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
76 __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
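// c gathers the even-indexed samples {sp[0], sp[2], sp[4], sp[6]} and
// d gathers the odd-indexed samples {sp[1], sp[3], sp[5], sp[7]}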
77 _mm_store_ps(dpl, c);
78 _mm_store_ps(dph, d);
79 }
80 }
81
83 static inline
84 void sse2_interleave32(float* dp, float* spl, float* sph, int width)
85 {
86 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
87 {
88 __m128 a = _mm_load_ps(spl);
89 __m128 b = _mm_load_ps(sph);
90 __m128 c = _mm_unpacklo_ps(a, b);
91 __m128 d = _mm_unpackhi_ps(a, b);
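// unpacklo/unpackhi re-interleave the two subbands:
// c = {spl[0], sph[0], spl[1], sph[1]}, d = {spl[2], sph[2], spl[3], sph[3]}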
92 _mm_store_ps(dp, c);
93 _mm_store_ps(dp + 4, d);
94 }
95 }
96
98 static inline
99 void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
100 {
101 for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
102 {
103 __m128d a = _mm_load_pd(sp);
104 __m128d b = _mm_load_pd(sp + 2);
105 __m128d c = _mm_shuffle_pd(a, b, 0);
106 __m128d d = _mm_shuffle_pd(a, b, 3);
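// c = {sp[0], sp[2]} (even samples), d = {sp[1], sp[3]} (odd samples)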
107 _mm_store_pd(dpl, c);
108 _mm_store_pd(dph, d);
109 }
110 }
111
113 static inline
114 void sse2_interleave64(double* dp, double* spl, double* sph, int width)
115 {
116 for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
117 {
118 __m128d a = _mm_load_pd(spl);
119 __m128d b = _mm_load_pd(sph);
120 __m128d c = _mm_unpacklo_pd(a, b);
121 __m128d d = _mm_unpackhi_pd(a, b);
122 _mm_store_pd(dp, c);
123 _mm_store_pd(dp + 2, d);
124 }
125 }
126
128 static
129 void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
130 const line_buf* other, const line_buf* aug,
131 ui32 repeat, bool synthesis)
132 {
133 const si32 a = s->rev.Aatk;
134 const si32 b = s->rev.Batk;
135 const ui8 e = s->rev.Eatk;
136 __m128i vb = _mm_set1_epi32(b);
137
138 si32* dst = aug->i32;
139 const si32* src1 = sig->i32, * src2 = other->i32;
140 // The general definition of the wavelet in Part 2 is slightly
141 // different from that in Part 1, although they are mathematically
142 // equivalent; here, we identify the simpler Part 1 forms and employ them
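// Every reversible lifting step evaluates
//   aug[i] +/-= (b + a * (sig[i] + other[i])) >> e
// (+= for analysis, -= for synthesis); the branches below specialize
// a == 1 and a == -1 so that no multiplication is needed.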
143 if (a == 1)
144 { // 5/3 update and any case with a == 1
145 int i = (int)repeat;
146 if (synthesis)
147 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
148 {
149 __m128i s1 = _mm_load_si128((__m128i*)src1);
150 __m128i s2 = _mm_load_si128((__m128i*)src2);
151 __m128i d = _mm_load_si128((__m128i*)dst);
152 __m128i t = _mm_add_epi32(s1, s2);
153 __m128i v = _mm_add_epi32(vb, t);
154 __m128i w = _mm_srai_epi32(v, e);
155 d = _mm_sub_epi32(d, w);
156 _mm_store_si128((__m128i*)dst, d);
157 }
158 else
159 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
160 {
161 __m128i s1 = _mm_load_si128((__m128i*)src1);
162 __m128i s2 = _mm_load_si128((__m128i*)src2);
163 __m128i d = _mm_load_si128((__m128i*)dst);
164 __m128i t = _mm_add_epi32(s1, s2);
165 __m128i v = _mm_add_epi32(vb, t);
166 __m128i w = _mm_srai_epi32(v, e);
167 d = _mm_add_epi32(d, w);
168 _mm_store_si128((__m128i*)dst, d);
169 }
170 }
171 else if (a == -1 && b == 1 && e == 1)
172 { // 5/3 predict
173 int i = (int)repeat;
174 if (synthesis)
175 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
176 {
177 __m128i s1 = _mm_load_si128((__m128i*)src1);
178 __m128i s2 = _mm_load_si128((__m128i*)src2);
179 __m128i d = _mm_load_si128((__m128i*)dst);
180 __m128i t = _mm_add_epi32(s1, s2);
181 __m128i w = _mm_srai_epi32(t, e);
182 d = _mm_add_epi32(d, w);
183 _mm_store_si128((__m128i*)dst, d);
184 }
185 else
186 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
187 {
188 __m128i s1 = _mm_load_si128((__m128i*)src1);
189 __m128i s2 = _mm_load_si128((__m128i*)src2);
190 __m128i d = _mm_load_si128((__m128i*)dst);
191 __m128i t = _mm_add_epi32(s1, s2);
192 __m128i w = _mm_srai_epi32(t, e);
193 d = _mm_sub_epi32(d, w);
194 _mm_store_si128((__m128i*)dst, d);
195 }
196 }
197 else if (a == -1)
198 { // any case with a == -1, which is not 5/3 predict
199 int i = (int)repeat;
200 if (synthesis)
201 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
202 {
203 __m128i s1 = _mm_load_si128((__m128i*)src1);
204 __m128i s2 = _mm_load_si128((__m128i*)src2);
205 __m128i d = _mm_load_si128((__m128i*)dst);
206 __m128i t = _mm_add_epi32(s1, s2);
207 __m128i v = _mm_sub_epi32(vb, t);
208 __m128i w = _mm_srai_epi32(v, e);
209 d = _mm_sub_epi32(d, w);
210 _mm_store_si128((__m128i*)dst, d);
211 }
212 else
213 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
214 {
215 __m128i s1 = _mm_load_si128((__m128i*)src1);
216 __m128i s2 = _mm_load_si128((__m128i*)src2);
217 __m128i d = _mm_load_si128((__m128i*)dst);
218 __m128i t = _mm_add_epi32(s1, s2);
219 __m128i v = _mm_sub_epi32(vb, t);
220 __m128i w = _mm_srai_epi32(v, e);
221 d = _mm_add_epi32(d, w);
222 _mm_store_si128((__m128i*)dst, d);
223 }
224 }
225 else { // general case
226 // 32bit multiplication is not supported in sse2; we need sse4.1,
227 // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
228 // keeping the LSBs
229 if (synthesis)
230 for (ui32 i = repeat; i > 0; --i)
231 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
232 else
233 for (ui32 i = repeat; i > 0; --i)
234 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
235 }
236 }
237
239 static
240 void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
241 const line_buf* other, const line_buf* aug,
242 ui32 repeat, bool synthesis)
243 {
244 const si64 a = s->rev.Aatk;
245 const si64 b = s->rev.Batk;
246 const ui8 e = s->rev.Eatk;
247 __m128i vb = _mm_set1_epi64x(b);
248 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
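// ve is the helper mask that sse2_mm_srai_epi64() needs to emulate a
// 64-bit arithmetic right shift by e bits (SSE2 has no such instruction)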
249
250 si64* dst = aug->i64;
251 const si64* src1 = sig->i64, * src2 = other->i64;
252 // The general definition of the wavelet in Part 2 is slightly
253 // different from that in Part 1, although they are mathematically
254 // equivalent; here, we identify the simpler Part 1 forms and employ them
255 if (a == 1)
256 { // 5/3 update and any case with a == 1
257 int i = (int)repeat;
258 if (synthesis)
259 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
260 {
261 __m128i s1 = _mm_load_si128((__m128i*)src1);
262 __m128i s2 = _mm_load_si128((__m128i*)src2);
263 __m128i d = _mm_load_si128((__m128i*)dst);
264 __m128i t = _mm_add_epi64(s1, s2);
265 __m128i v = _mm_add_epi64(vb, t);
266 __m128i w = sse2_mm_srai_epi64(v, e, ve);
267 d = _mm_sub_epi64(d, w);
268 _mm_store_si128((__m128i*)dst, d);
269 }
270 else
271 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
272 {
273 __m128i s1 = _mm_load_si128((__m128i*)src1);
274 __m128i s2 = _mm_load_si128((__m128i*)src2);
275 __m128i d = _mm_load_si128((__m128i*)dst);
276 __m128i t = _mm_add_epi64(s1, s2);
277 __m128i v = _mm_add_epi64(vb, t);
278 __m128i w = sse2_mm_srai_epi64(v, e, ve);
279 d = _mm_add_epi64(d, w);
280 _mm_store_si128((__m128i*)dst, d);
281 }
282 }
283 else if (a == -1 && b == 1 && e == 1)
284 { // 5/3 predict
285 int i = (int)repeat;
286 if (synthesis)
287 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
288 {
289 __m128i s1 = _mm_load_si128((__m128i*)src1);
290 __m128i s2 = _mm_load_si128((__m128i*)src2);
291 __m128i d = _mm_load_si128((__m128i*)dst);
292 __m128i t = _mm_add_epi64(s1, s2);
293 __m128i w = sse2_mm_srai_epi64(t, e, ve);
294 d = _mm_add_epi64(d, w);
295 _mm_store_si128((__m128i*)dst, d);
296 }
297 else
298 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
299 {
300 __m128i s1 = _mm_load_si128((__m128i*)src1);
301 __m128i s2 = _mm_load_si128((__m128i*)src2);
302 __m128i d = _mm_load_si128((__m128i*)dst);
303 __m128i t = _mm_add_epi64(s1, s2);
304 __m128i w = sse2_mm_srai_epi64(t, e, ve);
305 d = _mm_sub_epi64(d, w);
306 _mm_store_si128((__m128i*)dst, d);
307 }
308 }
309 else if (a == -1)
310 { // any case with a == -1, which is not 5/3 predict
311 int i = (int)repeat;
312 if (synthesis)
313 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
314 {
315 __m128i s1 = _mm_load_si128((__m128i*)src1);
316 __m128i s2 = _mm_load_si128((__m128i*)src2);
317 __m128i d = _mm_load_si128((__m128i*)dst);
318 __m128i t = _mm_add_epi64(s1, s2);
319 __m128i v = _mm_sub_epi64(vb, t);
320 __m128i w = sse2_mm_srai_epi64(v, e, ve);
321 d = _mm_sub_epi64(d, w);
322 _mm_store_si128((__m128i*)dst, d);
323 }
324 else
325 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
326 {
327 __m128i s1 = _mm_load_si128((__m128i*)src1);
328 __m128i s2 = _mm_load_si128((__m128i*)src2);
329 __m128i d = _mm_load_si128((__m128i*)dst);
330 __m128i t = _mm_add_epi64(s1, s2);
331 __m128i v = _mm_sub_epi64(vb, t);
332 __m128i w = sse2_mm_srai_epi64(v, e, ve);
333 d = _mm_add_epi64(d, w);
334 _mm_store_si128((__m128i*)dst, d);
335 }
336 }
337 else { // general case
338 // 64bit multiplication is not supported in sse2
339 if (synthesis)
340 for (ui32 i = repeat; i > 0; --i)
341 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
342 else
343 for (ui32 i = repeat; i > 0; --i)
344 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
345 }
346 }
347
349 void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
350 const line_buf* other, const line_buf* aug,
351 ui32 repeat, bool synthesis)
352 {
353 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
354 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
355 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
356 {
357 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
358 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
359 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
360 sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
361 }
362 else
363 {
364 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
365 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
366 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
367 sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
368 }
369 }
370
372 static
373 void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
374 const line_buf* hdst, const line_buf* src,
375 ui32 width, bool even)
376 {
377 if (width > 1)
378 {
379 // split src into ldst and hdst
380 {
381 float* dpl = even ? ldst->f32 : hdst->f32;
382 float* dph = even ? hdst->f32 : ldst->f32;
383 float* sp = src->f32;
384 int w = (int)width;
385 sse2_deinterleave32(dpl, dph, sp, w);
386 }
387
388 si32* hp = hdst->i32, * lp = ldst->i32;
389 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
390 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
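// when the line starts at an even position the low-pass band gets the
// extra sample of an odd-length line; otherwise the high-pass band does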
391 ui32 num_steps = atk->get_num_steps();
392 for (ui32 j = num_steps; j > 0; --j)
393 {
394 // the current lifting step (analysis applies the steps in reverse order)
395 const lifting_step* s = atk->get_step(j - 1);
396 const si32 a = s->rev.Aatk;
397 const si32 b = s->rev.Batk;
398 const ui8 e = s->rev.Eatk;
399 __m128i vb = _mm_set1_epi32(b);
400
401 // extension
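// replicate the boundary samples so the lifting loop below can read
// one sample beyond either end of the low-pass buffer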
402 lp[-1] = lp[0];
403 lp[l_width] = lp[l_width - 1];
404 // lifting step
405 const si32* sp = lp;
406 si32* dp = hp;
407 if (a == 1)
408 { // 5/3 update and any case with a == 1
409 int i = (int)h_width;
410 if (even)
411 {
412 for (; i > 0; i -= 4, sp += 4, dp += 4)
413 {
414 __m128i s1 = _mm_load_si128((__m128i*)sp);
415 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
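// sp is assumed 16-byte aligned (see the aligned load above); sp + 1
// is not, hence the unaligned load for the neighboring samples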
416 __m128i d = _mm_load_si128((__m128i*)dp);
417 __m128i t = _mm_add_epi32(s1, s2);
418 __m128i v = _mm_add_epi32(vb, t);
419 __m128i w = _mm_srai_epi32(v, e);
420 d = _mm_add_epi32(d, w);
421 _mm_store_si128((__m128i*)dp, d);
422 }
423 }
424 else
425 {
426 for (; i > 0; i -= 4, sp += 4, dp += 4)
427 {
428 __m128i s1 = _mm_load_si128((__m128i*)sp);
429 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
430 __m128i d = _mm_load_si128((__m128i*)dp);
431 __m128i t = _mm_add_epi32(s1, s2);
432 __m128i v = _mm_add_epi32(vb, t);
433 __m128i w = _mm_srai_epi32(v, e);
434 d = _mm_add_epi32(d, w);
435 _mm_store_si128((__m128i*)dp, d);
436 }
437 }
438 }
439 else if (a == -1 && b == 1 && e == 1)
440 { // 5/3 predict
441 int i = (int)h_width;
442 if (even)
443 for (; i > 0; i -= 4, sp += 4, dp += 4)
444 {
445 __m128i s1 = _mm_load_si128((__m128i*)sp);
446 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
447 __m128i d = _mm_load_si128((__m128i*)dp);
448 __m128i t = _mm_add_epi32(s1, s2);
449 __m128i w = _mm_srai_epi32(t, e);
450 d = _mm_sub_epi32(d, w);
451 _mm_store_si128((__m128i*)dp, d);
452 }
453 else
454 for (; i > 0; i -= 4, sp += 4, dp += 4)
455 {
456 __m128i s1 = _mm_load_si128((__m128i*)sp);
457 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
458 __m128i d = _mm_load_si128((__m128i*)dp);
459 __m128i t = _mm_add_epi32(s1, s2);
460 __m128i w = _mm_srai_epi32(t, e);
461 d = _mm_sub_epi32(d, w);
462 _mm_store_si128((__m128i*)dp, d);
463 }
464 }
465 else if (a == -1)
466 { // any case with a == -1, which is not 5/3 predict
467 int i = (int)h_width;
468 if (even)
469 for (; i > 0; i -= 4, sp += 4, dp += 4)
470 {
471 __m128i s1 = _mm_load_si128((__m128i*)sp);
472 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
473 __m128i d = _mm_load_si128((__m128i*)dp);
474 __m128i t = _mm_add_epi32(s1, s2);
475 __m128i v = _mm_sub_epi32(vb, t);
476 __m128i w = _mm_srai_epi32(v, e);
477 d = _mm_add_epi32(d, w);
478 _mm_store_si128((__m128i*)dp, d);
479 }
480 else
481 for (; i > 0; i -= 4, sp += 4, dp += 4)
482 {
483 __m128i s1 = _mm_load_si128((__m128i*)sp);
484 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
485 __m128i d = _mm_load_si128((__m128i*)dp);
486 __m128i t = _mm_add_epi32(s1, s2);
487 __m128i v = _mm_sub_epi32(vb, t);
488 __m128i w = _mm_srai_epi32(v, e);
489 d = _mm_add_epi32(d, w);
490 _mm_store_si128((__m128i*)dp, d);
491 }
492 }
493 else {
494 // general case
495 // 32bit multiplication is not supported in sse2; we need sse4.1's
496 // _mm_mullo_epi32 for that
496 if (even)
497 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
498 *dp += (b + a * (sp[0] + sp[1])) >> e;
499 else
500 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
501 *dp += (b + a * (sp[-1] + sp[0])) >> e;
502 }
503
504 // swap buffers
505 si32* t = lp; lp = hp; hp = t;
506 even = !even;
507 ui32 w = l_width; l_width = h_width; h_width = w;
508 }
509 }
510 else {
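// single-sample line: an even-positioned sample is pure low-pass and is
// copied; an odd-positioned sample is high-pass and is doubled, matching
// the length-1 rule of the reversible transform (synthesis halves it back)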
511 if (even)
512 ldst->i32[0] = src->i32[0];
513 else
514 hdst->i32[0] = src->i32[0] << 1;
515 }
516 }
517
519 static
520 void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
521 const line_buf* hdst, const line_buf* src,
522 ui32 width, bool even)
523 {
524 if (width > 1)
525 {
526 // split src into ldst and hdst
527 {
528 double* dpl = (double*)(even ? ldst->p : hdst->p);
529 double* dph = (double*)(even ? hdst->p : ldst->p);
530 double* sp = (double*)src->p;
531 int w = (int)width;
532 sse2_deinterleave64(dpl, dph, sp, w);
533 }
534
535 si64* hp = hdst->i64, * lp = ldst->i64;
536 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
537 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
538 ui32 num_steps = atk->get_num_steps();
539 for (ui32 j = num_steps; j > 0; --j)
540 {
542 // the current lifting step (analysis applies the steps in reverse order)
542 const lifting_step* s = atk->get_step(j - 1);
543 const si32 a = s->rev.Aatk;
544 const si32 b = s->rev.Batk;
545 const ui8 e = s->rev.Eatk;
546 __m128i vb = _mm_set1_epi64x(b);
547 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
548
549 // extension
550 lp[-1] = lp[0];
551 lp[l_width] = lp[l_width - 1];
552 // lifting step
553 const si64* sp = lp;
554 si64* dp = hp;
555 if (a == 1)
556 { // 5/3 update and any case with a == 1
557 int i = (int)h_width;
558 if (even)
559 {
560 for (; i > 0; i -= 2, sp += 2, dp += 2)
561 {
562 __m128i s1 = _mm_load_si128((__m128i*)sp);
563 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
564 __m128i d = _mm_load_si128((__m128i*)dp);
565 __m128i t = _mm_add_epi64(s1, s2);
566 __m128i v = _mm_add_epi64(vb, t);
567 __m128i w = sse2_mm_srai_epi64(v, e, ve);
568 d = _mm_add_epi64(d, w);
569 _mm_store_si128((__m128i*)dp, d);
570 }
571 }
572 else
573 {
574 for (; i > 0; i -= 2, sp += 2, dp += 2)
575 {
576 __m128i s1 = _mm_load_si128((__m128i*)sp);
577 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
578 __m128i d = _mm_load_si128((__m128i*)dp);
579 __m128i t = _mm_add_epi64(s1, s2);
580 __m128i v = _mm_add_epi64(vb, t);
581 __m128i w = sse2_mm_srai_epi64(v, e, ve);
582 d = _mm_add_epi64(d, w);
583 _mm_store_si128((__m128i*)dp, d);
584 }
585 }
586 }
587 else if (a == -1 && b == 1 && e == 1)
588 { // 5/3 predict
589 int i = (int)h_width;
590 if (even)
591 for (; i > 0; i -= 2, sp += 2, dp += 2)
592 {
593 __m128i s1 = _mm_load_si128((__m128i*)sp);
594 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
595 __m128i d = _mm_load_si128((__m128i*)dp);
596 __m128i t = _mm_add_epi64(s1, s2);
597 __m128i w = sse2_mm_srai_epi64(t, e, ve);
598 d = _mm_sub_epi64(d, w);
599 _mm_store_si128((__m128i*)dp, d);
600 }
601 else
602 for (; i > 0; i -= 2, sp += 2, dp += 2)
603 {
604 __m128i s1 = _mm_load_si128((__m128i*)sp);
605 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
606 __m128i d = _mm_load_si128((__m128i*)dp);
607 __m128i t = _mm_add_epi64(s1, s2);
608 __m128i w = sse2_mm_srai_epi64(t, e, ve);
609 d = _mm_sub_epi64(d, w);
610 _mm_store_si128((__m128i*)dp, d);
611 }
612 }
613 else if (a == -1)
614 { // any case with a == -1, which is not 5/3 predict
615 int i = (int)h_width;
616 if (even)
617 for (; i > 0; i -= 2, sp += 2, dp += 2)
618 {
619 __m128i s1 = _mm_load_si128((__m128i*)sp);
620 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
621 __m128i d = _mm_load_si128((__m128i*)dp);
622 __m128i t = _mm_add_epi64(s1, s2);
623 __m128i v = _mm_sub_epi64(vb, t);
624 __m128i w = sse2_mm_srai_epi64(v, e, ve);
625 d = _mm_add_epi64(d, w);
626 _mm_store_si128((__m128i*)dp, d);
627 }
628 else
629 for (; i > 0; i -= 2, sp += 2, dp += 2)
630 {
631 __m128i s1 = _mm_load_si128((__m128i*)sp);
632 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
633 __m128i d = _mm_load_si128((__m128i*)dp);
634 __m128i t = _mm_add_epi64(s1, s2);
635 __m128i v = _mm_sub_epi64(vb, t);
636 __m128i w = sse2_mm_srai_epi64(v, e, ve);
637 d = _mm_add_epi64(d, w);
638 _mm_store_si128((__m128i*)dp, d);
639 }
640 }
641 else {
642 // general case
643 // 64bit multiplication is not supported in sse2
644 if (even)
645 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
646 *dp += (b + a * (sp[0] + sp[1])) >> e;
647 else
648 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
649 *dp += (b + a * (sp[-1] + sp[0])) >> e;
650 }
651
652 // swap buffers
653 si64* t = lp; lp = hp; hp = t;
654 even = !even;
655 ui32 w = l_width; l_width = h_width; h_width = w;
656 }
657 }
658 else {
659 if (even)
660 ldst->i64[0] = src->i64[0];
661 else
662 hdst->i64[0] = src->i64[0] << 1;
663 }
664 }
665
667 void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
668 const line_buf* hdst, const line_buf* src,
669 ui32 width, bool even)
670 {
671 if (src->flags & line_buf::LFT_32BIT)
672 {
673 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
674 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
675 sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
676 }
677 else
678 {
679 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
680 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
681 (src == NULL || src->flags & line_buf::LFT_64BIT));
682 sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
683 }
684 }
685
687 void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
688 const line_buf* lsrc, const line_buf* hsrc,
689 ui32 width, bool even)
690 {
691 if (width > 1)
692 {
693 bool ev = even;
694 si32* oth = hsrc->i32, * aug = lsrc->i32;
695 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
696 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
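// synthesis undoes the lifting steps in the reverse of the order used by
// analysis, with the sign of each update flipped; "aug" is the buffer being
// updated (initially the low-pass band) and "oth" supplies its neighbors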
697 ui32 num_steps = atk->get_num_steps();
698 for (ui32 j = 0; j < num_steps; ++j)
699 {
700 const lifting_step* s = atk->get_step(j);
701 const si32 a = s->rev.Aatk;
702 const si32 b = s->rev.Batk;
703 const ui8 e = s->rev.Eatk;
704 __m128i vb = _mm_set1_epi32(b);
705
706 // extension
707 oth[-1] = oth[0];
708 oth[oth_width] = oth[oth_width - 1];
709 // lifting step
710 const si32* sp = oth;
711 si32* dp = aug;
712 if (a == 1)
713 { // 5/3 update and any case with a == 1
714 int i = (int)aug_width;
715 if (ev)
716 {
717 for (; i > 0; i -= 4, sp += 4, dp += 4)
718 {
719 __m128i s1 = _mm_load_si128((__m128i*)sp);
720 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
721 __m128i d = _mm_load_si128((__m128i*)dp);
722 __m128i t = _mm_add_epi32(s1, s2);
723 __m128i v = _mm_add_epi32(vb, t);
724 __m128i w = _mm_srai_epi32(v, e);
725 d = _mm_sub_epi32(d, w);
726 _mm_store_si128((__m128i*)dp, d);
727 }
728 }
729 else
730 {
731 for (; i > 0; i -= 4, sp += 4, dp += 4)
732 {
733 __m128i s1 = _mm_load_si128((__m128i*)sp);
734 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
735 __m128i d = _mm_load_si128((__m128i*)dp);
736 __m128i t = _mm_add_epi32(s1, s2);
737 __m128i v = _mm_add_epi32(vb, t);
738 __m128i w = _mm_srai_epi32(v, e);
739 d = _mm_sub_epi32(d, w);
740 _mm_store_si128((__m128i*)dp, d);
741 }
742 }
743 }
744 else if (a == -1 && b == 1 && e == 1)
745 { // 5/3 predict
746 int i = (int)aug_width;
747 if (ev)
748 for (; i > 0; i -= 4, sp += 4, dp += 4)
749 {
750 __m128i s1 = _mm_load_si128((__m128i*)sp);
751 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
752 __m128i d = _mm_load_si128((__m128i*)dp);
753 __m128i t = _mm_add_epi32(s1, s2);
754 __m128i w = _mm_srai_epi32(t, e);
755 d = _mm_add_epi32(d, w);
756 _mm_store_si128((__m128i*)dp, d);
757 }
758 else
759 for (; i > 0; i -= 4, sp += 4, dp += 4)
760 {
761 __m128i s1 = _mm_load_si128((__m128i*)sp);
762 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
763 __m128i d = _mm_load_si128((__m128i*)dp);
764 __m128i t = _mm_add_epi32(s1, s2);
765 __m128i w = _mm_srai_epi32(t, e);
766 d = _mm_add_epi32(d, w);
767 _mm_store_si128((__m128i*)dp, d);
768 }
769 }
770 else if (a == -1)
771 { // any case with a == -1, which is not 5/3 predict
772 int i = (int)aug_width;
773 if (ev)
774 for (; i > 0; i -= 4, sp += 4, dp += 4)
775 {
776 __m128i s1 = _mm_load_si128((__m128i*)sp);
777 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
778 __m128i d = _mm_load_si128((__m128i*)dp);
779 __m128i t = _mm_add_epi32(s1, s2);
780 __m128i v = _mm_sub_epi32(vb, t);
781 __m128i w = _mm_srai_epi32(v, e);
782 d = _mm_sub_epi32(d, w);
783 _mm_store_si128((__m128i*)dp, d);
784 }
785 else
786 for (; i > 0; i -= 4, sp += 4, dp += 4)
787 {
788 __m128i s1 = _mm_load_si128((__m128i*)sp);
789 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
790 __m128i d = _mm_load_si128((__m128i*)dp);
791 __m128i t = _mm_add_epi32(s1, s2);
792 __m128i v = _mm_sub_epi32(vb, t);
793 __m128i w = _mm_srai_epi32(v, e);
794 d = _mm_sub_epi32(d, w);
795 _mm_store_si128((__m128i*)dp, d);
796 }
797 }
798 else {
799 // general case
800 // 32bit multiplication is not supported in sse2; we need sse4.1,
801 // where we can use _mm_mullo_epi32, which multiplies
802 // 32bit x 32bit, keeping the LSBs
803 if (ev)
804 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
805 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
806 else
807 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
808 *dp -= (b + a * (sp[0] + sp[1])) >> e;
809 }
810
811 // swap buffers
812 si32* t = aug; aug = oth; oth = t;
813 ev = !ev;
814 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
815 }
816
817 // combine both lsrc and hsrc into dst
818 {
819 float* dp = dst->f32;
820 float* spl = even ? lsrc->f32 : hsrc->f32;
821 float* sph = even ? hsrc->f32 : lsrc->f32;
822 int w = (int)width;
823 sse2_interleave32(dp, spl, sph, w);
824 }
825 }
826 else {
827 if (even)
828 dst->i32[0] = lsrc->i32[0];
829 else
830 dst->i32[0] = hsrc->i32[0] >> 1;
831 }
832 }
833
835 void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
836 const line_buf* lsrc, const line_buf* hsrc,
837 ui32 width, bool even)
838 {
839 if (width > 1)
840 {
841 bool ev = even;
842 si64* oth = hsrc->i64, * aug = lsrc->i64;
843 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
844 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
845 ui32 num_steps = atk->get_num_steps();
846 for (ui32 j = 0; j < num_steps; ++j)
847 {
848 const lifting_step* s = atk->get_step(j);
849 const si32 a = s->rev.Aatk;
850 const si32 b = s->rev.Batk;
851 const ui8 e = s->rev.Eatk;
852 __m128i vb = _mm_set1_epi64x(b);
853 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
854
855 // extension
856 oth[-1] = oth[0];
857 oth[oth_width] = oth[oth_width - 1];
858 // lifting step
859 const si64* sp = oth;
860 si64* dp = aug;
861 if (a == 1)
862 { // 5/3 update and any case with a == 1
863 int i = (int)aug_width;
864 if (ev)
865 {
866 for (; i > 0; i -= 2, sp += 2, dp += 2)
867 {
868 __m128i s1 = _mm_load_si128((__m128i*)sp);
869 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
870 __m128i d = _mm_load_si128((__m128i*)dp);
871 __m128i t = _mm_add_epi64(s1, s2);
872 __m128i v = _mm_add_epi64(vb, t);
873 __m128i w = sse2_mm_srai_epi64(v, e, ve);
874 d = _mm_sub_epi64(d, w);
875 _mm_store_si128((__m128i*)dp, d);
876 }
877 }
878 else
879 {
880 for (; i > 0; i -= 2, sp += 2, dp += 2)
881 {
882 __m128i s1 = _mm_load_si128((__m128i*)sp);
883 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
884 __m128i d = _mm_load_si128((__m128i*)dp);
885 __m128i t = _mm_add_epi64(s1, s2);
886 __m128i v = _mm_add_epi64(vb, t);
887 __m128i w = sse2_mm_srai_epi64(v, e, ve);
888 d = _mm_sub_epi64(d, w);
889 _mm_store_si128((__m128i*)dp, d);
890 }
891 }
892 }
893 else if (a == -1 && b == 1 && e == 1)
894 { // 5/3 predict
895 int i = (int)aug_width;
896 if (ev)
897 for (; i > 0; i -= 2, sp += 2, dp += 2)
898 {
899 __m128i s1 = _mm_load_si128((__m128i*)sp);
900 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
901 __m128i d = _mm_load_si128((__m128i*)dp);
902 __m128i t = _mm_add_epi64(s1, s2);
903 __m128i w = sse2_mm_srai_epi64(t, e, ve);
904 d = _mm_add_epi64(d, w);
905 _mm_store_si128((__m128i*)dp, d);
906 }
907 else
908 for (; i > 0; i -= 2, sp += 2, dp += 2)
909 {
910 __m128i s1 = _mm_load_si128((__m128i*)sp);
911 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
912 __m128i d = _mm_load_si128((__m128i*)dp);
913 __m128i t = _mm_add_epi64(s1, s2);
914 __m128i w = sse2_mm_srai_epi64(t, e, ve);
915 d = _mm_add_epi64(d, w);
916 _mm_store_si128((__m128i*)dp, d);
917 }
918 }
919 else if (a == -1)
920 { // any case with a == -1, which is not 5/3 predict
921 int i = (int)aug_width;
922 if (ev)
923 for (; i > 0; i -= 2, sp += 2, dp += 2)
924 {
925 __m128i s1 = _mm_load_si128((__m128i*)sp);
926 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
927 __m128i d = _mm_load_si128((__m128i*)dp);
928 __m128i t = _mm_add_epi64(s1, s2);
929 __m128i v = _mm_sub_epi64(vb, t);
930 __m128i w = sse2_mm_srai_epi64(v, e, ve);
931 d = _mm_sub_epi64(d, w);
932 _mm_store_si128((__m128i*)dp, d);
933 }
934 else
935 for (; i > 0; i -= 2, sp += 2, dp += 2)
936 {
937 __m128i s1 = _mm_load_si128((__m128i*)sp);
938 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
939 __m128i d = _mm_load_si128((__m128i*)dp);
940 __m128i t = _mm_add_epi64(s1, s2);
941 __m128i v = _mm_sub_epi64(vb, t);
942 __m128i w = sse2_mm_srai_epi64(v, e, ve);
943 d = _mm_sub_epi64(d, w);
944 _mm_store_si128((__m128i*)dp, d);
945 }
946 }
947 else {
948 // general case
949 // 64bit multiplication is not supported in sse2
950 if (ev)
951 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
952 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
953 else
954 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
955 *dp -= (b + a * (sp[0] + sp[1])) >> e;
956 }
957
958 // swap buffers
959 si64* t = aug; aug = oth; oth = t;
960 ev = !ev;
961 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
962 }
963
964 // combine both lsrc and hsrc into dst
965 {
966 double* dp = (double*)dst->p;
967 double* spl = (double*)(even ? lsrc->p : hsrc->p);
968 double* sph = (double*)(even ? hsrc->p : lsrc->p);
969 int w = (int)width;
970 sse2_interleave64(dp, spl, sph, w);
971 }
972 }
973 else {
974 if (even)
975 dst->i64[0] = lsrc->i64[0];
976 else
977 dst->i64[0] = hsrc->i64[0] >> 1;
978 }
979 }
980
982 void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
983 const line_buf* lsrc, const line_buf* hsrc,
984 ui32 width, bool even)
985 {
986 if (dst->flags & line_buf::LFT_32BIT)
987 {
988 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
989 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
990 sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
991 }
992 else
993 {
994 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
995 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
996 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
997 sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
998 }
999 }
1000
1001 } // !local
1002} // !ojph