module std.simd;

/*
pure:
nothrow:
@safe:
@nogc:
*/

///////////////////////////////////////////////////////////////////////////////
// Version mess
///////////////////////////////////////////////////////////////////////////////

version(X86)
{
    version(DigitalMars)
        version = NoSIMD; // DMD-x86 does not support SIMD
    else
        version = X86_OR_X64;
}
else version(X86_64)
{
    version = X86_OR_X64;
}
else version(PPC)
    version = PowerPC;
else version(PPC64)
    version = PowerPC;

version (X86_OR_X64)
    version = HaveSIMD;
version (PowerPC)
    version = HaveSIMD;

version (HaveSIMD):

version(GNU)
    version = GNU_OR_LDC;
version(LDC)
    version = GNU_OR_LDC;

///////////////////////////////////////////////////////////////////////////////
// Platform specific imports
///////////////////////////////////////////////////////////////////////////////

version(DigitalMars)
{
    // DMD intrinsics
}
else version(GNU)
{
    // GDC intrinsics
    import gcc.builtins;
}

version(GNU){} else
{
    struct attribute
    {
        string attrib;
        string value;
    }
}

public import core.simd;
import std.traits, std.typetuple;


///////////////////////////////////////////////////////////////////////////////
// Define available versions of vector hardware
///////////////////////////////////////////////////////////////////////////////

version(X86_OR_X64)
{
    enum SIMDVer
    {
        SSE,
        SSE2,
        SSE3,   // Later Pentium4 + Athlon64
        SSSE3,  // Introduced in Intel 'Core' series, AMD 'Bobcat'
        SSE41,  // (Intel) Introduced in 45nm 'Core' series
        SSE42,  // (Intel) Introduced in i7
        SSE4a,  // (AMD) Introduced to 'Bobcat' (includes SSSE3 and below)
        SSE5,   // (AMD) XOP, FMA4 and CVT16. Introduced to 'Bulldozer' (includes ALL prior architectures)
        AVX,    // 256bit, 16regs, 3 operand opcodes, no integer
        AVX2,   // integer support for AVX
        AVX512  // 512bit, 32regs
    }

    // we source this from the compiler flags, ie. -msse2 for instance
    immutable SIMDVer simdVer = SIMDVer.SSE42;

    enum string[SIMDVer.max+1] targetNames =
    [
        "sse",
        "sse2",
        "sse3",
        "ssse3",
        "sse4.1",
        "sse4.2",
        "sse4a",
        "sse5",
        "avx",
        "avx2",
        "avx512"
    ];
}
else version(ARM)
{
    enum SIMDVer
    {
        VFP,    // Should we implement this? it's deprecated on modern ARM chips
        NEON,   // Added to Cortex-A8, Snapdragon
        VFPv4   // Added to Cortex-A15
    }

    immutable SIMDVer simdVer = SIMDVer.NEON;

    enum string[SIMDVer.max+1] targetNames =
    [
        "", "", "" // TODO...
    ];
}
else version(PowerPC)
{
    enum SIMDVer
    {
        VMX,
        VMX128,      // Extended register file (128 regs), reduced integer support, and some awesome bonus opcodes
        PairedSingle // Used on Nintendo platforms
    }

    immutable SIMDVer simdVer = SIMDVer.VMX;

    enum string[SIMDVer.max+1] targetNames =
    [
        "altivec",
        "", // 'vmx128' doesn't exist...
        ""
    ];
}
else version(MIPS_SIMD)
{
    enum SIMDVer
    {
        Unknown,

        PairedSingle, // 32bit pairs in 64bit regs
        MIPS3D,       // Licensed MIPS SIMD extension
        MDMX,         // More comprehensive SIMD extension
        XBurst1,      // XBurst1 custom SIMD (Android)
        PSP_VFPU      // SIMD extension used by the Playstation Portable
    }

    immutable SIMDVer simdVer = SIMDVer.Unknown;

    enum string[SIMDVer.max+1] targetNames =
    [
        "", "", "", "", "", "" // TODO...
    ];
}
else
{
    // TODO: it would be nice to provide a fallback for __ctfe and hardware with no SIMD unit...

    enum SIMDVer
    {
        None
    }

    immutable SIMDVer simdVer = SIMDVer.None;

    enum string[SIMDVer.max+1] targetNames =
    [
        ""
    ];
}

///////////////////////////////////////////////////////////////////////////////
// LLVM instructions and intrinsics for LDC.
///////////////////////////////////////////////////////////////////////////////

version(LDC)
{
    template RepeatType(T, size_t n, R...)
    {
        static if(n == 0)
            alias RepeatType = R;
        else
            alias RepeatType = RepeatType!(T, n - 1, T, R);
    }

    version(X86_OR_X64)
        import ldc.gccbuiltins_x86;

    import ldcsimd = ldc.simd;

    alias PblendvbParam = byte16;
}
else version(GNU)
{
    alias PblendvbParam = ubyte16;
}

version(GNU)
    version = GNU_OR_LDC;
version(LDC)
    version = GNU_OR_LDC;

///////////////////////////////////////////////////////////////////////////////
// Internal constants
///////////////////////////////////////////////////////////////////////////////

private
{
    enum ulong2 signMask2 = 0x8000_0000_0000_0000;
    enum uint4 signMask4 = 0x8000_0000;
    enum ushort8 signMask8 = 0x8000;
    enum ubyte16 signMask16 = 0x80;
}

///////////////////////////////////////////////////////////////////////////////
// Internal functions
///////////////////////////////////////////////////////////////////////////////

// TODO: deprecated; moved to std.traits...
enum bool isSIMDVector(T) = is(T : __vector(V[N]), V, size_t N);

template ElementType(T : __vector(V[N]), V, size_t N) if(isSIMDVector!T)
{
    alias Impl(T) = V;
    static if (__VERSION__ < 2068)
        alias ElementType = std.traits.ModifyTypePreservingSTC!(Impl, OriginalType!T);
    else
        alias ElementType = std.traits.ModifyTypePreservingTQ!(Impl, OriginalType!T);
}

enum NumElements(T : __vector(V[N]), V, size_t N) = N;

template PromotionOf(T)
{
    template Impl(T)
    {
        static if(is(T : __vector(V[N]), V, size_t N))
            alias Impl = __vector(Impl!V[N/2]);
        else static if(is(T == float))
            alias Impl = double;
        else static if(is(T == int))
            alias Impl = long;
        else static if(is(T == uint))
            alias Impl = ulong;
        else static if(is(T == short))
            alias Impl = int;
        else static if(is(T == ushort))
            alias Impl = uint;
        else static if(is(T == byte))
            alias Impl = short;
        else static if(is(T == ubyte))
            alias Impl = ushort;
        else
            static assert(0, "Incorrect type: " ~ T.stringof);
    }

    alias PromotionOf = std.traits.ModifyTypePreservingSTC!(Impl, OriginalType!T);
}
template DemotionOf(T)
{
    template Impl(T)
    {
        static if(is(T : __vector(V[N]), V, size_t N))
            alias Impl = __vector(Impl!V[N*2]);
        else static if(is(T == double))
            alias Impl = float;
        else static if(is(T == long))
            alias Impl = int;
        else static if(is(T == ulong))
            alias Impl = uint;
        else static if(is(T == int))
            alias Impl = short;
        else static if(is(T == uint))
            alias Impl = ushort;
        else static if(is(T == short))
            alias Impl = byte;
        else static if(is(T == ushort))
            alias Impl = ubyte;
        else
            static assert(0, "Incorrect type: " ~ T.stringof);
    }

    alias DemotionOf = std.traits.ModifyTypePreservingSTC!(Impl, OriginalType!T);
}
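
// A few illustrative checks (not part of the original module) of the type
// mappings defined above; assumes the usual core.simd 128bit vector aliases.
unittest
{
    static assert(is(PromotionOf!int4 == long2));
    static assert(is(PromotionOf!ushort8 == uint4));
    static assert(is(DemotionOf!double2 == float4));
    static assert(is(ElementType!float4 == float));
    static assert(NumElements!float4 == 4);
}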

private
{
    enum bool isOfType(U, V) = is(Unqual!U == Unqual!V);

    // pull the base type from a vector, array, or primitive
    // type. The first version does not work for vectors.
    template ArrayType(T : T[]) { alias T ArrayType; }
    template ArrayType(T) if(isSIMDVector!T)
    {
        // typeof T.array.init does not work for some reason, so we use this
        alias typeof(()
        {
            T a;
            return a.array;
        }()) ArrayType;
    }
    template BaseType(T)
    {
        static if(isSIMDVector!T)
            alias ElementType!T BaseType;
        else static if(isArray!T)
            alias ArrayType!T BaseType;
        else static if(isScalar!T)
            alias T BaseType;
        else
            static assert(0, "Unsupported type");
    }

    template isScalarFloat(T)
    {
        alias U = Unqual!T;
        enum bool isScalarFloat = is(U == float) || is(U == double);
    }

    template isScalarInt(T)
    {
        alias U = Unqual!T;
        enum bool isScalarInt = is(U == long) || is(U == ulong) || is(U == int) || is(U == uint) || is(U == short) || is(U == ushort) || is(U == byte) || is(U == ubyte);
    }

    template isScalarUnsigned(T)
    {
        alias U = Unqual!T;
        enum bool isScalarUnsigned = is(U == ulong) || is(U == uint) || is(U == ushort) || is(U == ubyte);
    }

    enum bool isScalar(T) = isScalarFloat!T || isScalarInt!T;
    enum bool isFloatArray(T) = isArray!T && isScalarFloat!(BaseType!T);
    enum bool isIntArray(T) = isArray!T && isScalarInt!(BaseType!T);
    enum bool isFloatVector(T) = isSIMDVector!T && isScalarFloat!(BaseType!T);
    enum bool isIntVector(T) = isSIMDVector!T && isScalarInt!(BaseType!T);
    enum bool isSigned(T) = isScalarInt!(BaseType!T) && !isScalarUnsigned!(BaseType!T);
    enum bool isUnsigned(T) = isScalarUnsigned!(BaseType!T);
    enum bool is64bitElement(T) = BaseType!(T).sizeof == 8;
    enum bool is64bitInteger(T) = is64bitElement!T && isScalarInt!(BaseType!T);
    enum bool is32bitElement(T) = BaseType!(T).sizeof == 4;
    enum bool is16bitElement(T) = BaseType!(T).sizeof == 2;
    enum bool is8bitElement(T) = BaseType!(T).sizeof == 1;

    /**** Templates for generating TypeTuples ****/

    template staticIota(int start, int end, int stride = 1)
    {
        static if(start >= end)
            alias staticIota = TypeTuple!();
        else
            alias staticIota = TypeTuple!(start, staticIota!(start + stride, end, stride));
    }

    template toTypeTuple(alias array, r...)
    {
        static if(array.length == r.length)
            alias toTypeTuple = r;
        else
            alias toTypeTuple = toTypeTuple!(array, r, array[r.length]);
    }

    template interleaveTuples(a...)
    {
        static if(a.length == 0)
            alias interleaveTuples = TypeTuple!();
        else
            alias interleaveTuples = TypeTuple!(a[0], a[$ / 2], interleaveTuples!(a[1 .. $ / 2], a[$ / 2 + 1 .. $]));
    }

    /**** And some helpers for various architectures ****/
    version(X86_OR_X64)
    {
        template shufMask(elements...)
        {
            static if(elements.length == 2)
                enum shufMask = ((elements[0] & 1) << 0) | ((elements[1] & 1) << 1);
            else static if(elements.length == 4)
                enum shufMask = ((elements[0] & 3) << 0) | ((elements[1] & 3) << 2) | ((elements[2] & 3) << 4) | ((elements[3] & 3) << 6);
            else
                static assert(0, "Incorrect number of elements");
        }
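
        // Illustrative checks (not in the original): SHUFPS/PSHUFD-style immediates
        // pack 2-bit lane indices from lowest to highest bit pair, so selecting
        // lanes (3,2,1,0) reverses a 4-lane vector.
        static assert(shufMask!(3, 2, 1, 0) == 0x1B);
        static assert(shufMask!(1, 0) == 0x01);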

        template pshufbMask(alias elements)
        {
            template c(a...)
            {
                static if(a.length == 0)
                    alias c = TypeTuple!();
                else
                    alias c = TypeTuple!(2 * a[0], 2 * a[0] + 1, c!(a[1 .. $]));
            }

            static if(elements.length == 16)
                alias pshufbMask = toTypeTuple!elements;
            else static if(elements.length == 8)
                alias pshufbMask = c!(toTypeTuple!elements);
            else
                static assert(0, "Unsupported parameter length");
        }
    }

    version(ARM)
    {
        template ARMOpType(T, bool Rounded = false)
        {
            // NOTE: 0-unsigned, 1-signed, 2-poly, 3-float, 4-unsigned rounded, 5-signed rounded
            static if(isOfType!(T, double2) || isOfType!(T, float4))
                enum uint ARMOpType = 3;
            else static if(isOfType!(T, long2) || isOfType!(T, int4) || isOfType!(T, short8) || isOfType!(T, byte16))
                enum uint ARMOpType = 1 + (Rounded ? 4 : 0);
            else static if(isOfType!(T, ulong2) || isOfType!(T, uint4) || isOfType!(T, ushort8) || isOfType!(T, ubyte16))
                enum uint ARMOpType = 0 + (Rounded ? 4 : 0);
            else
                static assert(0, "Incorrect type");
        }
    }
}


///////////////////////////////////////////////////////////////////////////////
// Public API
///////////////////////////////////////////////////////////////////////////////


///////////////////////////////////////////////////////////////////////////////
// Load and store

// load scalar into all components (!! or just X?). Note: SLOW on many architectures
T loadScalar(T, SIMDVer Ver = simdVer)(BaseType!T s)
{
    return loadScalar!(T, Ver)(&s);
}

// load scalar from memory
T loadScalar(T, SIMDVer Ver = simdVer)(BaseType!T* pS) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static assert(0, "TODO");
        }
        else version(GNU)
        {
            static if(isOfType!(T, float4))
                return __builtin_ia32_loadss(pS);
            else static if(isOfType!(T, double2))
                return __builtin_ia32_loadddup(pS);
            else
                static assert(0, "TODO");
        }
        else version(LDC)
        {
            //TODO: non-optimal
            T r = 0;
            r = ldcsimd.insertelement!(T, 0)(r, *pS);
            return r;
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// load vector from an unaligned address
T loadUnaligned(T, SIMDVer Ver = simdVer)(BaseType!T* pV) @trusted
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static assert(0, "TODO");
        }
        else version(GNU)
        {
            static if(isOfType!(T, float4))
                return __builtin_ia32_loadups(pV);
            else static if(isOfType!(T, double2))
                return __builtin_ia32_loadupd(pV);
            else
                return cast(T)__builtin_ia32_loaddqu(cast(char*)pV);
        }
        else version(LDC)
            return ldcsimd.loadUnaligned!T(pV);
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}
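
// A brief usage sketch (not part of the original module); assumes a GNU or LDC
// x86 build where the unaligned load/store paths above are implemented.
unittest
{
    float[5] data = [1.0f, 2.0f, 3.0f, 4.0f, 5.0f];
    float4 v = loadUnaligned!float4(&data[1]); // load from a possibly unaligned address
    assert(v.array == [2.0f, 3.0f, 4.0f, 5.0f]);

    float[4] result;
    storeUnaligned(v, result.ptr);
    assert(result == [2.0f, 3.0f, 4.0f, 5.0f]);
}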

// return the X element in a scalar register
BaseType!T getScalar(SIMDVer Ver = simdVer, T)(T v) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static assert(0, "TODO");
        }
        else version(GNU)
        {
            static if(Ver >= SIMDVer.SSE41 && !is16bitElement!T)
            {
                static if(isOfType!(T, float4))
                    return __builtin_ia32_vec_ext_v4sf(v, 0);
                else static if(isOfType!(T, double2))
                    return __builtin_ia32_vec_ext_v2df(v, 0);
                else static if(is64bitElement!T)
                    return __builtin_ia32_vec_ext_v2di(v, 0);
                else static if(is32bitElement!T)
                    return __builtin_ia32_vec_ext_v4si(v, 0);
                // else static if(is16bitElement!T)
                //     return __builtin_ia32_vec_ext_v8hi(v, 0); // does this opcode exist??
                else static if(is8bitElement!T)
                    return __builtin_ia32_vec_ext_v16qi(v, 0);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            return ldcsimd.extractelement!(T, 0)(v);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// store the X element to the address provided
// If we use BaseType!T* as a parameter type, T cannot be inferred.
// That's why we need to use template parameter S and check that it is
// the base type in the template constraint. We will use this in some other
// functions too.
void storeScalar(SIMDVer Ver = simdVer, T, S = BaseType!T)(T v, S* pS) if(isSIMDVector!T)
{
    // TODO: check this optimises correctly!! (opcode writes directly to memory)
    *pS = getScalar(v);
}

// store the vector to an unaligned address
void storeUnaligned(SIMDVer Ver = simdVer, T, S = BaseType!T)(T v, S* pV) @trusted if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static assert(0, "TODO");
        }
        else version(GNU_OR_LDC)
        {
            static if(isOfType!(T, float4))
                __builtin_ia32_storeups(pV, v);
            else static if(isOfType!(T, double2))
                __builtin_ia32_storeupd(pV, v);
            else
                __builtin_ia32_storedqu(cast(char*)pV, cast(byte16)v);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}


///////////////////////////////////////////////////////////////////////////////
// Shuffle, swizzle, permutation

// broadcast X to all elements
T getX(SIMDVer Ver = simdVer, T)(inout T v) if(isSIMDVector!T)
{
    // broadcast the first component
    return swizzle!("0", Ver)(v);
}

// broadcast Y to all elements
T getY(SIMDVer Ver = simdVer, T)(inout T v) if(isSIMDVector!T && NumElements!T >= 2)
{
    // broadcast the second component
    return swizzle!("1", Ver)(v);
}

// broadcast Z to all elements
T getZ(SIMDVer Ver = simdVer, T)(inout T v) if(isSIMDVector!T && NumElements!T >= 3)
{
    // broadcast the 3rd component
    return swizzle!("2", Ver)(v);
}

// broadcast W to all elements
T getW(SIMDVer Ver = simdVer, T)(inout T v) if(isSIMDVector!T && NumElements!T >= 4)
{
    // broadcast the 4th component
    return swizzle!("3", Ver)(v);
}

// set the X element
T setX(SIMDVer Ver = simdVer, T)(inout T v, inout T x) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, double2))
                    return __simd(XMM.BLENDPD, v, x, 1);
                else static if(isOfType!(T, float4))
                    return __simd(XMM.BLENDPS, v, x, 1);
                else static if(is64bitElement!T)
                    return __simd(XMM.PBLENDW, v, x, 0x0F);
                else static if(is32bitElement!T)
                    return __simd(XMM.PBLENDW, v, x, 0x03);
                else static if(is16bitElement!T)
                    return __simd(XMM.PBLENDW, v, x, 0x01);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, double2))
                    return __builtin_ia32_blendpd(v, x, 1);
                else static if(isOfType!(T, float4))
                    return __builtin_ia32_blendps(v, x, 1);
                else static if(is64bitElement!T)
                    return __builtin_ia32_pblendw128(v, x, 0x0F);
                else static if(is32bitElement!T)
                    return __builtin_ia32_pblendw128(v, x, 0x03);
                else static if(is16bitElement!T)
                    return __builtin_ia32_pblendw128(v, x, 0x01);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            return ldcsimd.shufflevector!(T, n, staticIota!(1, n))(v, x);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// set the Y element
T setY(SIMDVer Ver = simdVer, T)(inout T v, inout T y) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, double2))
                    return __simd(XMM.BLENDPD, v, y, 2);
                else static if(isOfType!(T, float4))
                    return __simd(XMM.BLENDPS, v, y, 2);
                else static if(is64bitElement!T)
                    return __simd(XMM.PBLENDW, v, y, 0xF0);
                else static if(is32bitElement!T)
                    return __simd(XMM.PBLENDW, v, y, 0x0C);
                else static if(is16bitElement!T)
                    return __simd(XMM.PBLENDW, v, y, 0x02);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, double2))
                    return __builtin_ia32_blendpd(v, y, 2);
                else static if(isOfType!(T, float4))
                    return __builtin_ia32_blendps(v, y, 2);
                else static if(is64bitElement!T)
                    return __builtin_ia32_pblendw128(v, y, 0xF0);
                else static if(is32bitElement!T)
                    return __builtin_ia32_pblendw128(v, y, 0x0C);
                else static if(is16bitElement!T)
                    return __builtin_ia32_pblendw128(v, y, 0x02);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            static assert(n >= 2);
            return ldcsimd.shufflevector!(T, 0, n + 1, staticIota!(2, n))(v, y);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// set the Z element
T setZ(SIMDVer Ver = simdVer, T)(inout T v, inout T z) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, float4))
                    return __simd(XMM.BLENDPS, v, z, 4);
                else static if(is32bitElement!T)
                    return __simd(XMM.PBLENDW, v, z, 0x30);
                else static if(is16bitElement!T)
                    return __simd(XMM.PBLENDW, v, z, 0x04);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, float4))
                    return __builtin_ia32_blendps(v, z, 4);
                else static if(is32bitElement!T)
                    return __builtin_ia32_pblendw128(v, z, 0x30);
                else static if(is16bitElement!T)
                    return __builtin_ia32_pblendw128(v, z, 0x04);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            static assert(n >= 3);
            return ldcsimd.shufflevector!(T, 0, 1, n + 2, staticIota!(3, n))(v, z);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// set the W element
T setW(SIMDVer Ver = simdVer, T)(inout T v, inout T w) if(isSIMDVector!T)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, float4))
                    return __simd(XMM.BLENDPS, v, w, 8);
                else static if(is32bitElement!T)
                    return __simd(XMM.PBLENDW, v, w, 0xC0);
                else static if(is16bitElement!T)
                    return __simd(XMM.PBLENDW, v, w, 0x08);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(Ver >= SIMDVer.SSE41 && !is8bitElement!T)
            {
                static if(isOfType!(T, float4))
                    return __builtin_ia32_blendps(v, w, 8);
                else static if(is32bitElement!T)
                    return __builtin_ia32_pblendw128(v, w, 0xC0);
                else static if(is16bitElement!T)
                    return __builtin_ia32_pblendw128(v, w, 0x08);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            static assert(n >= 4);
            return ldcsimd.shufflevector!(T, 0, 1, 2, n + 3, staticIota!(4, n))(v, w);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}
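
// A brief usage sketch (not part of the original module): replaces a single
// lane with setY. Assumes the default simdVer provides at least SSE4.1.
unittest
{
    float4 v = [1.0f, 2.0f, 3.0f, 4.0f];
    float4 y = 10.0f;          // scalar is broadcast to all lanes
    float4 r = setY(v, y);     // only lane 1 is taken from y
    assert(r.array == [1.0f, 10.0f, 3.0f, 4.0f]);
}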

// swizzle a vector: r = swizzle!"ZZWX"(v); // r = v.zzwx
T swizzle(string swiz, SIMDVer Ver = simdVer, T)(inout T v)
{
    // meta to extract the elements from a swizzle string
    template getElements(string s, T)
    {
        // accepted element component names
        template elementNames(int numElements)
        {
            static if(numElements == 2)
                alias elementNames = TypeTuple!("01", "xy");
            else static if(numElements == 3)
                alias elementNames = TypeTuple!("012", "xyz", "rgb");
            else static if(numElements == 4)
                alias elementNames = TypeTuple!("0123", "xyzw", "rgba");
            else static if(numElements == 8)
                alias elementNames = TypeTuple!("01234567");
            else static if(numElements == 16)
                alias elementNames = TypeTuple!("0123456789abcdef");
            else
                alias elementNames = TypeTuple!();
        }

        enum char lower(char c) = c >= 'A' && c <= 'Z' ? c + 32 : c;

        // get the component name set for a swizzle string
        template Components(string s, names...)
        {
            template charIn(char c, string s)
            {
                static if(s.length == 0)
                    enum charIn = false;
                else
                    enum charIn = lower!c == s[0] || charIn!(c, s[1..$]);
            }
            template allIn(string chars, string s)
            {
                static if(chars.length == 0)
                    enum allIn = true;
                else
                    enum allIn = charIn!(chars[0], s) && allIn!(chars[1..$], s);
            }

            static if(s.length == 0 || names.length == 0)
                enum string Components = null;
            else static if(allIn!(s, names[0]))
                enum Components = names[0];
            else
                enum Components = Components!(s, names[1..$]);
        }

        // used to find the element id of a component
        template Offset(char c, string elements, int i = 0)
        {
            static if(i == elements.length)
                enum Offset = -1;
            else static if(lower!c == elements[i])
                enum Offset = i;
            else
                enum Offset = Offset!(c, elements, i+1);
        }

        // parse the swizzle string
        template Parse(string chars, string elements)
        {
            static if(chars.length == 0 || elements.length == 0)
                alias Parse = TypeTuple!();
            else
                alias Parse = TypeTuple!(Offset!(chars[0], elements), Parse!(chars[1..$], elements));
        }

        alias getElements = Parse!(s, Components!(s, elementNames!(NumElements!T)));
    }

    // repeat an element to form a broadcast
    template broadcast(size_t element, size_t count)
    {
        static if(element == -1 || count == 0)
            alias broadcast = TypeTuple!();
        else
            alias broadcast = TypeTuple!(element, broadcast!(element, count-1));
    }

    template isIdentity(E...)
    {
        template Impl(size_t i)
        {
            static if(i == E.length)
                enum Impl = true;
            else
                enum Impl = E[i] == i && Impl!(i+1);
        }
        enum isIdentity = Impl!0;
    }
    template isBroadcast(E...)
    {
        template Impl(size_t i)
        {
            static if(i == E.length)
                enum Impl = true;
            else
                enum Impl = E[i] == E[i-1] && Impl!(i+1);
        }
        enum isBroadcast = Impl!1;
    }

    enum numElements = NumElements!T;

    // get the swizzle elements
    alias el = getElements!(swiz, T);

    static assert(el.length > 0, "Invalid swizzle string: '" ~ swiz ~ "'");

    // support broadcasting
    static if(el.length == 1)
        alias elements = broadcast!(el[0], numElements);
    else
        alias elements = el;

    // TODO: if there are fewer elements in the string than the type, should we pad with identity swizzle? ie, "yyx" -> "yyxw"
    static assert(elements.length == numElements, "Invalid number of components in swizzle string '" ~ swiz ~ "' for type " ~ T.stringof);

    static if(isIdentity!elements)
    {
        // early out if no swizzle took place
        return v;
    }
    else
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                // broadcasts can usually be implemented more efficiently...
                static if(isBroadcast!elements && !is32bitElement!T)
                {
                    static if(isOfType!(T, double2))
                    {
                        // unpacks are more efficient than shuffd
                        static if(elements[0] == 0)
                        {
                            static if(0)//Ver >= SIMDVer.SSE3) // TODO: *** WHY DOESN'T THIS WORK?!
                                return __simd(XMM.MOVDDUP, v);
                            else
                                return __simd(XMM.UNPCKLPD, v, v);
                        }
                        else
                            return __simd(XMM.UNPCKHPD, v, v);
                    }
                    else static if(is64bitElement!(T)) // (u)long2
                    {
                        // unpacks are more efficient than shuffd
                        static if(elements[0] == 0)
                            return __simd(XMM.PUNPCKLQDQ, v, v);
                        else
                            return __simd(XMM.PUNPCKHQDQ, v, v);
                    }
                    else static if(is16bitElement!T)
                    {
                        // TODO: we should use permute to perform this operation when immediates work >_<
                        static if(false)// Ver >= SIMDVer.SSSE3)
                        {
                            // immutable ubyte16 permuteControl = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0];
                            // return __simd(XMM.PSHUFB, v, permuteControl);
                        }
                        else
                        {
                            // TODO: is this most efficient?
                            // No it is not... we should use a single shuflw/shufhw followed by a 64bit unpack...
                            enum int[] shufValues = [0x00, 0x55, 0xAA, 0xFF];
                            T t = __simd(XMM.PSHUFD, v, v, shufValues[elements[0] >> 1]);
                            t = __simd(XMM.PSHUFLW, t, t, (elements[0] & 1) ? 0x55 : 0x00);
                            return __simd(XMM.PSHUFHW, t, t, (elements[0] & 1) ? 0x55 : 0x00);
                        }
                    }
                    else static if(is8bitElement!T)
                    {
                        static if(Ver >= SIMDVer.SSSE3)
                        {
                            static if(elements[0] == 0)
                                immutable ubyte16 permuteControl = __simd(XMM.XORPS, v, v); // generate a zero register
                            else
                                immutable ubyte16 permuteControl = cast(ubyte)elements[0]; // load a permute constant
                            return __simd(XMM.PSHUFB, v, permuteControl);
                        }
                        else
                            static assert(0, "Only supported in SSSE3 and above");
                    }
                    else
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                }
                else
                {
                    static if(isOfType!(T, double2))
                        return __simd(XMM.SHUFPD, v, v, shufMask!(elements)); // swizzle: YX
                    else static if(is64bitElement!(T)) // (u)long2
                        // use a 32bit integer shuffle for swizzle: YZ
                        return __simd(XMM.PSHUFD, v, v, shufMask!(elements[0]*2, elements[0]*2 + 1, elements[1]*2, elements[1]*2 + 1));
                    else static if(isOfType!(T, float4))
                    {
                        static if(elements == TypeTuple!(0,0,2,2) && Ver >= SIMDVer.SSE3)
                            return __simd(XMM.MOVSLDUP, v);
                        else static if(elements == TypeTuple!(1,1,3,3) && Ver >= SIMDVer.SSE3)
                            return __simd(XMM.MOVSHDUP, v);
                        else
                            return __simd(XMM.SHUFPS, v, v, shufMask!(elements));
                    }
                    else static if(is32bitElement!(T))
                        return __simd(XMM.PSHUFD, v, v, shufMask!(elements));
                    else static if(is8bitElement!T || is16bitElement!T)
                    {
                        static if(Ver >= SIMDVer.SSSE3)
                        {
                            // static ubyte[16] mask = [pshufbMask!elements];
                            // auto vmask = cast(ubyte16) __simd(XMM.LOADDQU, cast(char*) mask.ptr);
                            // XMM.LOADDQU does not exist, and I don't know of anything equivalent in DMD.
                            // this compiles (I hope there aren't any alignment issues):
                            __gshared static ubyte16 vmask = [pshufbMask!elements];
                            return cast(T) __simd(XMM.PSHUFB, cast(ubyte16) v, vmask);
                        }
                        else
                            static assert(0, "Only supported in SSSE3 and above");
                    }
                    else
                    {
                        // TODO: 16 and 8bit swizzles...
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                    }
                }
            }
            else version(GNU)
            {
                // broadcasts can usually be implemented more efficiently...
                static if(isBroadcast!elements && !is32bitElement!T)
                {
                    static if(isOfType!(T, double2))
                    {
                        // unpacks are more efficient than shuffd
                        static if(elements[0] == 0)
                        {
                            static if(0)//Ver >= SIMDVer.SSE3) // TODO: *** WHY DOESN'T THIS WORK?!
                                return __builtin_ia32_movddup(v);
                            else
                                return __builtin_ia32_unpcklpd(v, v);
                        }
                        else
                            return __builtin_ia32_unpckhpd(v, v);
                    }
                    else static if(is64bitElement!(T)) // (u)long2
                    {
                        // unpacks are more efficient than shuffd
                        static if(elements[0] == 0)
                            return __builtin_ia32_punpcklqdq128(v, v);
                        else
                            return __builtin_ia32_punpckhqdq128(v, v);
                    }
                    else static if(is16bitElement!T)
                    {
                        // TODO: we should use permute to perform this operation when immediates work >_<
                        static if(false)// Ver >= SIMDVer.SSSE3)
                        {
                            // immutable ubyte16 permuteControl = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0];
                            // return __builtin_ia32_pshufb128(v, permuteControl);
                        }
                        else
                        {
                            // TODO: is this most efficient?
                            // No it is not... we should use a single shuflw/shufhw followed by a 64bit unpack...
                            enum int[] shufValues = [0x00, 0x55, 0xAA, 0xFF];
                            T t = __builtin_ia32_pshufd(v, shufValues[elements[0] >> 1]);
                            t = __builtin_ia32_pshuflw(t, (elements[0] & 1) ? 0x55 : 0x00);
                            return __builtin_ia32_pshufhw(t, (elements[0] & 1) ? 0x55 : 0x00);
                        }
                    }
                    else static if(is8bitElement!T)
                    {
                        static if(Ver >= SIMDVer.SSSE3)
                        {
                            static if(elements[0] == 0)
                                immutable ubyte16 permuteControl = __builtin_ia32_xorps(v, v); // generate a zero register
                            else
                                immutable ubyte16 permuteControl = cast(ubyte)elements[0]; // load a permute constant
                            return __builtin_ia32_pshufb128(v, permuteControl);
                        }
                        else
                            static assert(0, "Only supported in SSSE3 and above");
                    }
                    else
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                }
                else
                {
                    static if(isOfType!(T, double2))
                        return __builtin_ia32_shufpd(v, v, shufMask!(elements)); // swizzle: YX
                    else static if(is64bitElement!(T)) // (u)long2
                        // use a 32bit integer shuffle for swizzle: YZ
                        return __builtin_ia32_pshufd(v, shufMask!(elements[0]*2, elements[0]*2 + 1, elements[1]*2, elements[1]*2 + 1));
                    else static if(isOfType!(T, float4))
                    {
                        static if(elements == TypeTuple!(0,0,2,2) && Ver >= SIMDVer.SSE3)
                            return __builtin_ia32_movsldup(v);
                        else static if(elements == TypeTuple!(1,1,3,3) && Ver >= SIMDVer.SSE3)
                            return __builtin_ia32_movshdup(v);
                        else
                            return __builtin_ia32_shufps(v, v, shufMask!(elements));
                    }
                    else static if(is32bitElement!(T))
                        return __builtin_ia32_pshufd(v, shufMask!(elements));
                    else static if(is8bitElement!T || is16bitElement!T)
                    {
                        static if(Ver >= SIMDVer.SSSE3)
                        {
                            static immutable ubyte[16] mask = [pshufbMask!elements];
                            auto vmask = cast(ubyte16) __builtin_ia32_loaddqu(cast(char*) mask.ptr);
                            return cast(T) __builtin_ia32_pshufb128(cast(ubyte16) v, vmask);
                        }
                        else
                            static assert(0, "Only supported in SSSE3 and above");
                    }
                    else
                    {
                        // TODO: 16 and 8bit swizzles...
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                    }
                }
            }
            else version(LDC)
            {
                return ldcsimd.shufflevector!(T, toTypeTuple!elements)(v, v);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}
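
// A small usage sketch (not part of the original module) of the swizzle
// element naming; assumes an SSE2-or-better x86 target.
unittest
{
    float4 v = [1.0f, 2.0f, 3.0f, 4.0f];
    float4 r = swizzle!"wzyx"(v);   // reverse the components
    assert(r.array == [4.0f, 3.0f, 2.0f, 1.0f]);
    float4 b = swizzle!"y"(v);      // a single component broadcasts to all lanes
    assert(b.array == [2.0f, 2.0f, 2.0f, 2.0f]);
}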

// assign bytes to the target according to a permute control register
T permute(SIMDVer Ver = simdVer, T)(inout T v, ubyte16 control)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSSE3)
                return __simd(XMM.PSHUFB, v, control);
            else
                static assert(0, "Only supported in SSSE3 and above");
        }
        else version(GNU_OR_LDC)
        {
            static if(Ver >= SIMDVer.SSSE3)
                return cast(T)__builtin_ia32_pshufb128(cast(ubyte16)v, control);
            else
                static assert(0, "Only supported in SSSE3 and above");
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// interleave low elements from 2 vectors
T interleaveLow(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2)
{
    // this really requires multiple return values >_<

    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(isOfType!(T, float4))
                return __simd(XMM.UNPCKLPS, v1, v2);
            else static if(isOfType!(T, double2))
                return __simd(XMM.UNPCKLPD, v1, v2);
            else static if(is64bitElement!T)
                return __simd(XMM.PUNPCKLQDQ, v1, v2);
            else static if(is32bitElement!T)
                return __simd(XMM.PUNPCKLDQ, v1, v2);
            else static if(is16bitElement!T)
                return __simd(XMM.PUNPCKLWD, v1, v2);
            else static if(is8bitElement!T)
                return __simd(XMM.PUNPCKLBW, v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(isOfType!(T, float4))
                return __builtin_ia32_unpcklps(v1, v2);
            else static if(isOfType!(T, double2))
                return __builtin_ia32_unpcklpd(v1, v2);
            else static if(is64bitElement!T)
                return __builtin_ia32_punpcklqdq128(v1, v2);
            else static if(is32bitElement!T)
                return __builtin_ia32_punpckldq128(v1, v2);
            else static if(is16bitElement!T)
                return __builtin_ia32_punpcklwd128(v1, v2);
            else static if(is8bitElement!T)
                return __builtin_ia32_punpcklbw128(v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            alias interleaveTuples!(staticIota!(0, n / 2), staticIota!(n, n + n / 2)) mask;
            return ldcsimd.shufflevector!(T, mask)(v1, v2);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

// interleave high elements from 2 vectors
T interleaveHigh(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2)
{
    // this really requires multiple return values >_<

    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(isOfType!(T, float4))
                return __simd(XMM.UNPCKHPS, v1, v2);
            else static if(isOfType!(T, double2))
                return __simd(XMM.UNPCKHPD, v1, v2);
            else static if(is64bitElement!T)
                return __simd(XMM.PUNPCKHQDQ, v1, v2);
            else static if(is32bitElement!T)
                return __simd(XMM.PUNPCKHDQ, v1, v2);
            else static if(is16bitElement!T)
                return __simd(XMM.PUNPCKHWD, v1, v2);
            else static if(is8bitElement!T)
                return __simd(XMM.PUNPCKHBW, v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU)
        {
            static if(isOfType!(T, float4))
                return __builtin_ia32_unpckhps(v1, v2);
            else static if(isOfType!(T, double2))
                return __builtin_ia32_unpckhpd(v1, v2);
            else static if(is64bitElement!T)
                return __builtin_ia32_punpckhqdq128(v1, v2);
            else static if(is32bitElement!T)
                return __builtin_ia32_punpckhdq128(v1, v2);
            else static if(is16bitElement!T)
                return __builtin_ia32_punpckhwd128(v1, v2);
            else static if(is8bitElement!T)
                return __builtin_ia32_punpckhbw128(v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(LDC)
        {
            enum int n = NumElements!T;
            alias interleaveTuples!(staticIota!(n / 2, n), staticIota!(n + n / 2, n + n)) mask;

            return ldcsimd.shufflevector!(T, mask)(v1, v2);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

//... there are many more useful permutation ops



///////////////////////////////////////////////////////////////////////////////
// Pack/unpack

// these are PERFECT examples of functions that would benefit from multiple return values!
/* eg.
short8,short8 unpackBytes(byte16)
{
    short8 low,high;
    low = bytes[0..4];
    high = bytes[4..8];
    return low,high;
}
*/

PromotionOf!T unpackLow(SIMDVer Ver = simdVer, T)(inout T v)
{
    version(X86_OR_X64)
    {
        static if(isOfType!(T, float4))
            return cast(PromotionOf!T)toDouble!Ver(v);
        else static if(isOfType!(T, int4))
            return cast(PromotionOf!T)interleaveLow!Ver(v, shiftRightImmediate!(31, Ver)(v));
        else static if(isOfType!(T, uint4))
            return cast(PromotionOf!T)interleaveLow!(Ver, T)(v, 0);
        else static if(isOfType!(T, short8))
            return shiftRightImmediate!(16, Ver)(cast(int4)interleaveLow!Ver(v, v));
        else static if(isOfType!(T, ushort8))
            return cast(PromotionOf!T)interleaveLow!(Ver, T)(v, 0);
        else static if(isOfType!(T, byte16))
            return shiftRightImmediate!(8, Ver)(cast(short8)interleaveLow!Ver(v, v));
        else static if(isOfType!(T, ubyte16))
            return cast(PromotionOf!T)interleaveLow!(Ver, T)(v, 0);
        else
            static assert(0, "Unsupported vector type: " ~ T.stringof);
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

PromotionOf!T unpackHigh(SIMDVer Ver = simdVer, T)(inout T v)
{
    version(X86_OR_X64)
    {
        static if(isOfType!(T, float4))
            return toDouble!Ver(swizzle!("zwzw", Ver)(v));
        else static if(isOfType!(T, int4))
            return cast(PromotionOf!T)interleaveHigh!Ver(v, shiftRightImmediate!(31, Ver)(v));
        else static if(isOfType!(T, uint4))
            return cast(PromotionOf!T)interleaveHigh!(Ver, T)(v, 0);
        else static if(isOfType!(T, short8))
            return shiftRightImmediate!(16, Ver)(cast(int4)interleaveHigh!Ver(v, v));
        else static if(isOfType!(T, ushort8))
            return cast(PromotionOf!T)interleaveHigh!(Ver, T)(v, 0);
        else static if(isOfType!(T, byte16))
            return shiftRightImmediate!(8, Ver)(cast(short8)interleaveHigh!Ver(v, v));
        else static if(isOfType!(T, ubyte16))
            return cast(PromotionOf!T)interleaveHigh!(Ver, T)(v, 0);
        else
            static assert(0, "Unsupported vector type: " ~ T.stringof);
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}
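
// A brief usage sketch (not part of the original module): widening unpack of a
// float4 into two double2 halves; assumes an SSE2-or-better x86 target.
unittest
{
    float4 v = [1.0f, 2.0f, 3.0f, 4.0f];
    double2 lo = unpackLow(v);     // low two elements widened to double
    double2 hi = unpackHigh(v);    // high two elements widened to double
    assert(lo.array == [1.0, 2.0]);
    assert(hi.array == [3.0, 4.0]);
}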

DemotionOf!T pack(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2)
{
    version(X86_OR_X64)
    {
        static if(isOfType!(T, double2))
            return interleaveLow!Ver(toFloat!Ver(v1), toFloat!Ver(v2));
        else
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, long2))
                    static assert(0, "TODO");
                else static if(isOfType!(T, ulong2))
                    static assert(0, "TODO");
                else static if(isOfType!(T, int4))
                    static assert(0, "TODO");
                else static if(isOfType!(T, uint4))
                    static assert(0, "TODO");
                else static if(is16bitElement!T)
                {
                    // return _mm_packus_epi16(_mm_and_si128(v1, 0x00FF), _mm_and_si128(v2, 0x00FF));
                    return __simd(XMM.PACKUSWB, v1, v2);
                }
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU)
            {
                static if(isOfType!(T, long2))
                    static assert(0, "TODO");
                else static if(isOfType!(T, ulong2))
                    static assert(0, "TODO");
                else static if(isOfType!(T, int4))
                {
                    static assert(0, "TODO");
                    // return _mm_packs_epi32( _mm_srai_epi32( _mm_slli_epi16( a, 16), 16), _mm_srai_epi32( _mm_slli_epi32( b, 16), 16) );
                }
                else static if(isOfType!(T, uint4))
                {
                    static assert(0, "TODO");
                    // return _mm_packs_epi32( _mm_srai_epi32( _mm_slli_epi32( a, 16), 16), _mm_srai_epi32( _mm_slli_epi32( b, 16), 16) );
                }
                else static if(is16bitElement!T)
                    static assert(0, "TODO");
                    // return _mm_packus_epi16(_mm_and_si128(v1, 0x00FF), _mm_and_si128(v2, 0x00FF));
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(LDC)
            {
                alias DemotionOf!T D;
                enum int n = NumElements!D;

                return ldcsimd.shufflevector!(D, staticIota!(0, 2 * n, 2))(cast(D) v1, cast(D) v2);
            }
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

DemotionOf!T packSaturate(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(isOfType!(T, int4))
                return __simd(XMM.PACKSSDW, v1, v2);
            else static if(isOfType!(T, uint4))
                static assert(0, "TODO: should we emulate this?");
            else static if(isOfType!(T, short8))
                return __simd(XMM.PACKSSWB, v1, v2);
            else static if(isOfType!(T, ushort8))
                return __simd(XMM.PACKUSWB, v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else version(GNU_OR_LDC)
        {
            static if(isOfType!(T, int4))
                return __builtin_ia32_packssdw128(v1, v2);
            else static if(isOfType!(T, uint4))
                static assert(0, "TODO: should we emulate this?");
            else static if(isOfType!(T, short8))
                return __builtin_ia32_packsswb128(v1, v2);
            else static if(isOfType!(T, ushort8))
                return __builtin_ia32_packuswb128(v1, v2);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}

///////////////////////////////////////////////////////////////////////////////
// Type conversion

int4 toInt(SIMDVer Ver = simdVer, T)(inout T v)
{
    static if(isOfType!(T, int4))
        return v;
    else
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, float4))
                    return __simd(XMM.CVTPS2DQ, v);
                else static if(isOfType!(T, double2))
                    return __simd(XMM.CVTPD2DQ, v); // TODO: z,w are undefined... should we repeat xy to zw?
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU_OR_LDC)
            {
                static if(isOfType!(T, float4))
                    return __builtin_ia32_cvtps2dq(v);
                else static if(isOfType!(T, double2))
                    return __builtin_ia32_cvtpd2dq(v); // TODO: z,w are undefined... should we repeat xy to zw?
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

float4 toFloat(SIMDVer Ver = simdVer, T)(inout T v)
{
    static if(isOfType!(T, float4))
        return v;
    else
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, int4))
                    return __simd(XMM.CVTDQ2PS, v);
                else static if(isOfType!(T, double2))
                    return __simd(XMM.CVTPD2PS, v);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU_OR_LDC)
            {
                static if(isOfType!(T, int4))
                    return __builtin_ia32_cvtdq2ps(v);
                else static if(isOfType!(T, double2))
                    return __builtin_ia32_cvtpd2ps(v);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

double2 toDouble(SIMDVer Ver = simdVer, T)(inout T v)
{
    static if(isOfType!(T, double2))
        return v;
    else
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, int4))
                    return __simd(XMM.CVTDQ2PD, v);
                else static if(isOfType!(T, float4))
                    return __simd(XMM.CVTPS2PD, v);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU_OR_LDC)
            {
                static if(isOfType!(T, int4))
                    return __builtin_ia32_cvtdq2pd(v);
                else static if(isOfType!(T, float4))
                    return __builtin_ia32_cvtps2pd(v);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}
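
// A brief usage sketch (not part of the original module); assumes the default
// MXCSR round-to-nearest-even mode on x86.
unittest
{
    float4 f = [1.5f, -2.5f, 3.0f, -4.0f];
    int4 i = toInt(f);              // cvtps2dq: 1.5 and -2.5 round to even
    assert(i.array == [2, -2, 3, -4]);
    float4 back = toFloat(i);
    assert(back.array == [2.0f, -2.0f, 3.0f, -4.0f]);
}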

///////////////////////////////////////////////////////////////////////////////
// Basic mathematical operations

// unary absolute

template abs(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T abs(inout T v)
    {
        /******************************
        * integer abs with no branches
        * mask = v >> numBits(v)-1;
        * r = (v + mask) ^ mask;
        ******************************/

        static if(isUnsigned!T)
            return v;
        else
        {
            version(X86_OR_X64)
            {
                version(DigitalMars)
                {
                    static if(isOfType!(T, double2))
                    {
                        return __simd(XMM.ANDNPD, cast(double2)signMask2, v);
                    }
                    else static if(isOfType!(T, float4))
                    {
                        return __simd(XMM.ANDNPS, cast(float4)signMask4, v);
                    }
                    else static if(Ver >= SIMDVer.SSSE3)
                    {
                        static if(is64bitElement!(T))
                            static assert(0, "Unsupported: abs(" ~ T.stringof ~ "). Should we emulate?");
                        else static if(is32bitElement!(T))
                            return __simd(XMM.PABSD, v);
                        else static if(is16bitElement!(T))
                            return __simd(XMM.PABSW, v);
                        else static if(is8bitElement!(T))
                            return __simd(XMM.PABSB, v);
                    }
                    else static if(isOfType!(T, int4))
                    {
                        int4 t = shiftRightImmediate!(31, Ver)(v);
                        return sub!Ver(xor!Ver(v, t), t);
                    }
                    else static if(isOfType!(T, short8))
                    {
                        return max!Ver(v, neg!Ver(v));
                    }
                    else static if(isOfType!(T, byte16))
                    {
                        T zero = 0;
                        byte16 t = maskGreater!Ver(zero, v);
                        return sub!Ver(xor!Ver(v, t), t);
                    }
                    else
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                }
                else version(GNU_OR_LDC)
                {
                    static if(isOfType!(T, double2))
                    {
                        version(GNU)
                            return __builtin_ia32_andnpd(cast(double2)signMask2, v);
                        else
                            return cast(double2)(~signMask2 & cast(ulong2)v);
                    }
                    else static if(isOfType!(T, float4))
                    {
                        version(GNU)
                            return __builtin_ia32_andnps(cast(float4)signMask4, v);
                        else
                            return cast(float4)(~signMask4 & cast(uint4)v);
                    }
                    else static if(Ver >= SIMDVer.SSSE3 && !isOfType!(T, long2))
                    {
                        static if(is32bitElement!(T))
                            return __builtin_ia32_pabsd128(v);
                        else static if(is16bitElement!(T))
                            return __builtin_ia32_pabsw128(v);
                        else static if(is8bitElement!(T))
                            return __builtin_ia32_pabsb128(v);
                    }
                    else static if(isOfType!(T, int4))
                    {
                        int4 t = shiftRightImmediate!(31, Ver)(v);
                        return sub!Ver(xor!Ver(v, t), t);
                    }
                    else static if(isOfType!(T, short8))
                    {
                        return max!Ver(v, neg!Ver(v));
                    }
                    else static if(isOfType!(T, byte16) || isOfType!(T, long2))
                    {
                        T zero = 0;
                        T t = maskGreater!Ver(zero, v);
                        return sub!Ver(xor!Ver(v, t), t);
                    }
                    else
                        static assert(0, "Unsupported vector type: " ~ T.stringof);
                }
            }
            else version(ARM)
            {
                static if(isOfType!(T, float4))
                    return __builtin_neon_vabsv4sf(v, ARMOpType!T);
                else static if(isOfType!(T, int4))
                    return __builtin_neon_vabsv4si(v, ARMOpType!T);
                else static if(isOfType!(T, short8))
                    return __builtin_neon_vabsv8hi(v, ARMOpType!T);
                else static if(isOfType!(T, byte16))
                    return __builtin_neon_vabsv16qi(v, ARMOpType!T);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else
            {
                static assert(0, "Unsupported on this architecture");
            }
        }
    }
}
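
// A brief usage sketch (not part of the original module); assumes an
// SSSE3-or-better x86 target so the PABSD path is taken for int4.
unittest
{
    int4 v = [-1, 2, -3, 4];
    int4 a = abs(v);
    assert(a.array == [1, 2, 3, 4]);
    int4 n = neg(v);
    assert(n.array == [1, -2, 3, -4]);
}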

// unary negate
template neg(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T neg(inout T v)
    {
        // D allows negating unsigned values, so I guess we should support it in SIMD too
        // static assert(!isUnsigned!(T), "Can't negate unsigned value");

        version(X86_OR_X64)
        {
            return -v;
        }
        else version(ARM)
        {
            static if(isOfType!(T, float4))
                return __builtin_neon_vnegv4sf(v, ARMOpType!T);
            else static if(isOfType!(T, int4))
                return __builtin_neon_vnegv4si(v, ARMOpType!T);
            else static if(isOfType!(T, short8))
                return __builtin_neon_vnegv8hi(v, ARMOpType!T);
            else static if(isOfType!(T, byte16))
                return __builtin_neon_vnegv16qi(v, ARMOpType!T);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

// binary add
template add(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T add(inout T v1, inout T v2)
    {
        version(X86_OR_X64)
        {
            return v1 + v2;
        }
        else version(ARM)
        {
            static if(isOfType!(T, float4))
                return __builtin_neon_vaddv4sf(v1, v2, ARMOpType!T);
            else static if(is64bitInteger!T)
                return __builtin_neon_vaddv2di(v1, v2, ARMOpType!T);
            else static if(is32bitElement!T)
                return __builtin_neon_vaddv4si(v1, v2, ARMOpType!T);
            else static if(is16bitElement!T)
                return __builtin_neon_vaddv8hi(v1, v2, ARMOpType!T);
            else static if(is8bitElement!T)
                return __builtin_neon_vaddv16qi(v1, v2, ARMOpType!T);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

// binary add and saturate
template addSaturate(SIMDVer Ver = simdVer)
{
    @attribute("target", targetNames[Ver])
    T addSaturate(T)(inout T v1, inout T v2)
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, short8))
                    return __simd(XMM.PADDSW, v1, v2);
                else static if(isOfType!(T, ushort8))
                    return __simd(XMM.PADDUSW, v1, v2);
                else static if(isOfType!(T, byte16))
                    return __simd(XMM.PADDSB, v1, v2);
                else static if(isOfType!(T, ubyte16))
                    return __simd(XMM.PADDUSB, v1, v2);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU_OR_LDC)
            {
                static if(isOfType!(T, short8))
                    return __builtin_ia32_paddsw128(v1, v2);
                else static if(isOfType!(T, ushort8))
                    return __builtin_ia32_paddusw128(v1, v2);
                else static if(isOfType!(T, byte16))
                    return __builtin_ia32_paddsb128(v1, v2);
                else static if(isOfType!(T, ubyte16))
                    return __builtin_ia32_paddusb128(v1, v2);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}
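
// A brief usage sketch (not part of the original module): saturating byte add
// clamps at 255 instead of wrapping; assumes an x86 target.
unittest
{
    ubyte16 v = 250;                    // scalar broadcasts to all lanes
    ubyte16 step = 10;
    ubyte16 r = addSaturate(v, step);
    assert(r.array[0] == 255);
}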

// binary subtract
template sub(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T sub(inout T v1, inout T v2)
    {
        version(X86_OR_X64)
        {
            return v1 - v2;
        }
        else version(ARM)
        {
            static if(isOfType!(T, float4))
                return __builtin_neon_vsubv4sf(v1, v2, ARMOpType!T);
            else static if(is64bitInteger!T)
                return __builtin_neon_vsubv2di(v1, v2, ARMOpType!T);
            else static if(is32bitElement!T)
                return __builtin_neon_vsubv4si(v1, v2, ARMOpType!T);
            else static if(is16bitElement!T)
                return __builtin_neon_vsubv8hi(v1, v2, ARMOpType!T);
            else static if(is8bitElement!T)
                return __builtin_neon_vsubv16qi(v1, v2, ARMOpType!T);
            else
                static assert(0, "Unsupported vector type: " ~ T.stringof);
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

// binary subtract and saturate
template subSaturate(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T subSaturate(inout T v1, inout T v2)
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, short8))
                    return __simd(XMM.PSUBSW, v1, v2);
                else static if(isOfType!(T, ushort8))
                    return __simd(XMM.PSUBUSW, v1, v2);
                else static if(isOfType!(T, byte16))
                    return __simd(XMM.PSUBSB, v1, v2);
                else static if(isOfType!(T, ubyte16))
                    return __simd(XMM.PSUBUSB, v1, v2);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
            else version(GNU_OR_LDC)
            {
                static if(isOfType!(T, short8))
                    return __builtin_ia32_psubsw128(v1, v2);
                else static if(isOfType!(T, ushort8))
                    return __builtin_ia32_psubusw128(v1, v2);
                else static if(isOfType!(T, byte16))
                    return __builtin_ia32_psubsb128(v1, v2);
                else static if(isOfType!(T, ubyte16))
                    return __builtin_ia32_psubusb128(v1, v2);
                else
                    static assert(0, "Unsupported vector type: " ~ T.stringof);
            }
        }
        else version(ARM)
        {
            static assert(0, "TODO");
        }
        else
        {
            static assert(0, "Unsupported on this architecture");
        }
    }
}

// binary multiply
template mul(SIMDVer Ver = simdVer, T)
{
    @attribute("target", targetNames[Ver])
    T mul(inout T v1, inout T v2)
    {
        version(X86_OR_X64)
        {
            version(DigitalMars)
            {
                static if(isOfType!(T, double2))
                    return __simd(XMM.MULPD, v1, v2);
                else static if(isOfType!(T, float4))
                    return __simd(XMM.MULPS, v1, v2);
                else static if(is64bitElement!T) // 9 ops : 5 lat (scalar possibly faster?)
                {
                    T l = __simd(XMM.PMULUDQ, v1, v2);
                    T h1 = __simd(XMM.PMULUDQ, v1, shiftRightImmediate!(32, Ver)(cast(ulong2)v2));
                    T h2 = __simd(XMM.PMULUDQ, v2, shiftRightImmediate!(32, Ver)(cast(ulong2)v1));
                    return add!Ver(l, add!Ver(shiftLeftImmediate!(32, Ver)(cast(ulong2)h1), shiftLeftImmediate!(32, Ver)(cast(ulong2)h2)));
                }
                else static if(is32bitElement!T)
                {
                    static if(Ver >= SIMDVer.SSE41)
                    {
                        return __simd(XMM.PMULLD, v1, v2);
                    }
                    else // 7 ops : 4 lat (scalar possibly faster?)
                    {
                        T t1 = shiftBytesLeftImmediate!(4, Ver)(v1);
                        T t2 = shiftBytesLeftImmediate!(4, Ver)(v2);
                        T r1 = __simd(XMM.PMULUDQ, v1, v2); // x, z
                        T r2 = __simd(XMM.PMULUDQ, t1, t2); // y, w
                        return interleaveLow!Ver(swizzle!("xzxz", Ver)(r1), swizzle!("xzxz", Ver)(r2));
                    }
                }
                else static if(is16bitElement!T)
                    return __simd(XMM.PMULLW, v1, v2);
                else static if(is8bitElement!T)
                {
                    static if(Ver >= SIMDVer.SSSE3) // 9 ops : 4 lat
                    {
                        // should we do this? it is very inefficient...
                        // perhaps it's better to just admit that SSE doesn't support byte mul?
2001 static assert(0, "Not implemented: this is really inefficient..."); 2002 // vpunpckhbw %xmm2, %xmm2, %xmm3 2003 // vpunpckhbw %xmm1, %xmm1, %xmm0 2004 // vpunpcklbw %xmm2, %xmm2, %xmm2 2005 // vpunpcklbw %xmm1, %xmm1, %xmm1 2006 // vpmullw %xmm0, %xmm3, %xmm0 2007 // vpshufb .LC1(%rip), %xmm0, %xmm0 2008 // vpmullw %xmm1, %xmm2, %xmm1 2009 // vpshufb .LC0(%rip), %xmm1, %xmm1 2010 // vpor %xmm0, %xmm1, %xmm0 2011 } 2012 else 2013 static assert(0, "Only supported in SSSE3 and above"); 2014 } 2015 else 2016 static assert(0, "Unsupported vector type: " ~ T.stringof); 2017 } 2018 else 2019 { 2020 return v1 * v2; 2021 } 2022 } 2023 else version(ARM) 2024 { 2025 static if(isOfType!(T, float4)) 2026 return __builtin_neon_vmulv4sf(v1, v2, ARMOpType!T); 2027 else static if(is64bitInteger!T) 2028 return __builtin_neon_vmulv2di(v1, v2, ARMOpType!T); 2029 else static if(is32bitElement!T) 2030 return __builtin_neon_vmulv4si(v1, v2, ARMOpType!T); 2031 else static if(is16bitElement!T) 2032 return __builtin_neon_vmulv8hi(v1, v2, ARMOpType!T); 2033 else static if(is8bitElement!T) 2034 return __builtin_neon_vmulv16qi(v1, v2, ARMOpType!T); 2035 else 2036 static assert(0, "Unsupported vector type: " ~ T.stringof); 2037 } 2038 else 2039 { 2040 static assert(0, "Unsupported on this architecture"); 2041 } 2042 } 2043 } 2044 2045 // multiply and add: v1*v2 + v3 2046 template madd(SIMDVer Ver = simdVer, T) 2047 { 2048 @attribute("target", targetNames[Ver]) 2049 T madd(inout T v1, inout T v2, inout T v3) 2050 { 2051 version(X86_OR_X64) 2052 { 2053 version(DigitalMars) 2054 { 2055 static if(isOfType!(T, double2) && Ver == SIMDVer.SSE5) 2056 return __simd(XMM.FMADDPD, v1, v2, v3); 2057 else static if(isOfType!(T, float4) && Ver == SIMDVer.SSE5) 2058 return __simd(XMM.FMADDPS, v1, v2, v3); 2059 else 2060 return add!Ver(mul!Ver(v1, v2), v3); 2061 } 2062 else version(GNU_OR_LDC) // TODO: declare the SSE5 builtins for LDC 2063 { 2064 static if(isOfType!(T, double2) && Ver == SIMDVer.SSE5) 2065 return __builtin_ia32_fmaddpd(v1, v2, v3); 2066 else static if(isOfType!(T, float4) && Ver == SIMDVer.SSE5) 2067 return __builtin_ia32_fmaddps(v1, v2, v3); 2068 else 2069 return add!Ver(mul!Ver(v1, v2), v3); 2070 } 2071 } 2072 else version(ARM) 2073 { 2074 static if(false)//Ver == SIMDVer.VFPv4) 2075 { 2076 // VFPv4 has better opcodes, but i can't find the intrinsics right now >_< 2077 // VFMA, VFMS, VFNMA, and VFNMS 2078 } 2079 else 2080 { 2081 static if(isOfType!(T, float4)) 2082 return __builtin_neon_vmlav4sf(v3, v1, v2, ARMOpType!T); 2083 else static if(is64bitInteger!T) 2084 return __builtin_neon_vmlav2di(v3, v1, v2, ARMOpType!T); 2085 else static if(is32bitElement!T) 2086 return __builtin_neon_vmlav4si(v3, v1, v2, ARMOpType!T); 2087 else static if(is16bitElement!T) 2088 return __builtin_neon_vmlav8hi(v3, v1, v2, ARMOpType!T); 2089 else static if(is8bitElement!T) 2090 return __builtin_neon_vmlav16qi(v3, v1, v2, ARMOpType!T); 2091 else 2092 static assert(0, "Unsupported vector type: " ~ T.stringof); 2093 } 2094 } 2095 else 2096 { 2097 static assert(0, "Unsupported on this architecture"); 2098 } 2099 } 2100 } 2101 // multiply and subtract: v1*v2 - v3 2102 template msub(SIMDVer Ver = simdVer, T) 2103 { 2104 @attribute("target", targetNames[Ver]) 2105 T msub(inout T v1, inout T v2, inout T v3) 2106 { 2107 version(X86_OR_X64) 2108 { 2109 version(DigitalMars) 2110 { 2111 return sub!Ver(mul!Ver(v1, v2), v3); 2112 } 2113 else version(GNU_OR_LDC) // TODO: declare the SSE5 builtins for LDC 2114 { 2115 static if(isOfType!(T, double2) 
&& Ver == SIMDVer.SSE5) 2116 return __builtin_ia32_fmsubpd(v1, v2, v3); 2117 else static if(isOfType!(T, float4) && Ver == SIMDVer.SSE5) 2118 return __builtin_ia32_fmsubps(v1, v2, v3); 2119 else 2120 return sub!Ver(mul!Ver(v1, v2), v3); 2121 } 2122 } 2123 else version(ARM) 2124 { 2125 static if(false)//Ver == SIMDVer.VFPv4) 2126 { 2127 // VFPv4 has better opcodes, but i can't find the intrinsics right now >_< 2128 // VFMA, VFMS, VFNMA, and VFNMS 2129 } 2130 else 2131 { 2132 return sub!Ver(mul!Ver(v1, v2), v3); 2133 } 2134 } 2135 else 2136 { 2137 static assert(0, "Unsupported on this architecture"); 2138 } 2139 } 2140 } 2141 2142 // negate multiply and add: -(v1*v2) + v3 2143 template nmadd(SIMDVer Ver = simdVer, T) 2144 { 2145 @attribute("target", targetNames[Ver]) 2146 T nmadd(inout T v1, inout T v2, inout T v3) 2147 { 2148 version(X86_OR_X64) 2149 { 2150 version(DigitalMars) 2151 { 2152 return sub!Ver(v3, mul!Ver(v1, v2)); 2153 } 2154 else version(GNU_OR_LDC) // TODO: declare the SSE5 builtins for LDC 2155 { 2156 static if(isOfType!(T, double2) && Ver == SIMDVer.SSE5) 2157 return __builtin_ia32_fnmaddpd(v1, v2, v3); 2158 else static if(isOfType!(T, float4) && Ver == SIMDVer.SSE5) 2159 return __builtin_ia32_fnmaddps(v1, v2, v3); 2160 else 2161 return sub!Ver(v3, mul!Ver(v1, v2)); 2162 } 2163 } 2164 else version(ARM) 2165 { 2166 static if(false)//Ver == SIMDVer.VFPv4) 2167 { 2168 // VFPv4 has better opcodes, but i can't find the intrinsics right now >_< 2169 // VFMA, VFMS, VFNMA, and VFNMS 2170 } 2171 else 2172 { 2173 // Note: ARM's msub is backwards, it performs: r = r - a*b 2174 // Which is identical to the conventinal nmadd: r = -(a*b) + c 2175 2176 static if(isOfType!(T, float4)) 2177 return __builtin_neon_vmlsv4sf(v3, v1, v2, ARMOpType!T); 2178 else static if(is64bitInteger!T) 2179 return __builtin_neon_vmlsv2di(v3, v1, v2, ARMOpType!T); 2180 else static if(is32bitElement!T) 2181 return __builtin_neon_vmlsv4si(v3, v1, v2, ARMOpType!T); 2182 else static if(is16bitElement!T) 2183 return __builtin_neon_vmlsv8hi(v3, v1, v2, ARMOpType!T); 2184 else static if(is8bitElement!T) 2185 return __builtin_neon_vmlsv16qi(v3, v1, v2, ARMOpType!T); 2186 else 2187 static assert(0, "Unsupported vector type: " ~ T.stringof); 2188 } 2189 } 2190 else version(PowerPC) 2191 { 2192 // note PowerPC also has an opcode for this... 
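		// (AltiVec's vnmsubfp computes -(a*b - c), i.e. c - a*b, which is exactly this nmadd;
		// it would be the natural mapping once PowerPC support is filled in.)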
2193 static assert(0, "Unsupported on this architecture"); 2194 } 2195 else 2196 { 2197 static assert(0, "Unsupported on this architecture"); 2198 } 2199 } 2200 } 2201 2202 // negate multiply and subtract: -(v1*v2) - v3 2203 template nmsub(SIMDVer Ver = simdVer, T) 2204 { 2205 @attribute("target", targetNames[Ver]) 2206 T nmsub(inout T v1, inout T v2, inout T v3) 2207 { 2208 version(X86_OR_X64) 2209 { 2210 version(DigitalMars) 2211 { 2212 return sub!Ver(neg!Ver(v3), mul!Ver(v1, v2)); 2213 } 2214 else version(GNU_OR_LDC) // TODO: declare the SSE5 builtins for LDC 2215 { 2216 static if(isOfType!(T, double2) && Ver == SIMDVer.SSE5) 2217 return __builtin_ia32_fnmsubpd(v1, v2, v3); 2218 else static if(isOfType!(T, float4) && Ver == SIMDVer.SSE5) 2219 return __builtin_ia32_fnmsubps(v1, v2, v3); 2220 else 2221 return sub!Ver(neg!Ver(v3), mul!Ver(v1, v2)); 2222 } 2223 } 2224 else version(ARM) 2225 { 2226 static if(false)//Ver == SIMDVer.VFPv4) 2227 { 2228 // VFPv4 has better opcodes, but i can't find the intrinsics right now >_< 2229 // VFMA, VFMS, VFNMA, and VFNMS 2230 } 2231 else 2232 { 2233 return nmadd!Ver(v1, v2, neg!Ver(v3)); 2234 } 2235 } 2236 else 2237 { 2238 static assert(0, "Unsupported on this architecture"); 2239 } 2240 } 2241 } 2242 2243 // min 2244 template min(SIMDVer Ver = simdVer, T) 2245 { 2246 @attribute("target", targetNames[Ver]) 2247 T min(inout T v1, inout T v2) 2248 { 2249 version(X86_OR_X64) 2250 { 2251 version(DigitalMars) 2252 { 2253 static if(isOfType!(T, double2)) 2254 return __simd(XMM.MINPD, v1, v2); 2255 else static if(isOfType!(T, float4)) 2256 return __simd(XMM.MINPS, v1, v2); 2257 else static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 2258 return selectGreater!Ver(v1, v2, v2, v1); 2259 else static if(isOfType!(T, int4)) 2260 { 2261 static if(Ver >= SIMDVer.SSE41) 2262 return __simd(XMM.PMINSD, v1, v2); 2263 else 2264 return selectGreater!Ver(v1, v2, v2, v1); 2265 } 2266 else static if(isOfType!(T, uint4)) 2267 { 2268 static if(Ver >= SIMDVer.SSE41) 2269 return __simd(XMM.PMINUD, v1, v2); 2270 else 2271 return selectGreater!Ver(v1, v2, v2, v1); 2272 } 2273 else static if(isOfType!(T, short8)) 2274 return __simd(XMM.PMINSW, v1, v2); // available in SSE2 2275 else static if(isOfType!(T, ushort8)) 2276 { 2277 static if(Ver >= SIMDVer.SSE41) 2278 return __simd(XMM.PMINUW, v1, v2); 2279 else 2280 return selectGreater!Ver(v1, v2, v2, v1); 2281 } 2282 else static if(isOfType!(T, byte16)) 2283 { 2284 static if(Ver >= SIMDVer.SSE41) 2285 return __simd(XMM.PMINSB, v1, v2); 2286 else 2287 return selectGreater!Ver(v1, v2, v2, v1); 2288 } 2289 else static if(isOfType!(T, ubyte16)) 2290 return __simd(XMM.PMINUB, v1, v2); // available in SSE2 2291 else 2292 static assert(0, "Unsupported vector type: " ~ T.stringof); 2293 } 2294 else version(GNU_OR_LDC) 2295 { 2296 static if(isOfType!(T, double2)) 2297 return __builtin_ia32_minpd(v1, v2); 2298 else static if(isOfType!(T, float4)) 2299 return __builtin_ia32_minps(v1, v2); 2300 else static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 2301 return selectGreater!Ver(v1, v2, v2, v1); 2302 else static if(isOfType!(T, int4)) 2303 { 2304 static if(Ver >= SIMDVer.SSE41) 2305 return __builtin_ia32_pminsd128(v1, v2); 2306 else 2307 return selectGreater!Ver(v1, v2, v2, v1); 2308 } 2309 else static if(isOfType!(T, uint4)) 2310 { 2311 static if(Ver >= SIMDVer.SSE41) 2312 return __builtin_ia32_pminud128(v1, v2); 2313 else 2314 return selectGreater!Ver(v1, v2, v2, v1); 2315 } 2316 else static if(isOfType!(T, short8)) 2317 return 
					__builtin_ia32_pminsw128(v1, v2); // available in SSE2
			else static if(isOfType!(T, ushort8))
			{
				static if(Ver >= SIMDVer.SSE41)
					return __builtin_ia32_pminuw128(v1, v2);
				else
					return selectGreater!Ver(v1, v2, v2, v1);
			}
			else static if(isOfType!(T, byte16))
			{
				static if(Ver >= SIMDVer.SSE41)
					return __builtin_ia32_pminsb128(v1, v2);
				else
					return selectGreater!Ver(v1, v2, v2, v1);
			}
			else static if(isOfType!(T, ubyte16))
				return __builtin_ia32_pminub128(v1, v2); // available in SSE2
			else
				static assert(0, "Unsupported vector type: " ~ T.stringof);
		}
	}
	else version(ARM)
	{
		static if(isOfType!(T, float4))
			return __builtin_neon_vminv4sf(v1, v2, ARMOpType!T);
		else static if(is64bitInteger!T)
			return __builtin_neon_vminv2di(v1, v2, ARMOpType!T);
		else static if(is32bitElement!T)
			return __builtin_neon_vminv4si(v1, v2, ARMOpType!T);
		else static if(is16bitElement!T)
			return __builtin_neon_vminv8hi(v1, v2, ARMOpType!T);
		else static if(is8bitElement!T)
			return __builtin_neon_vminv16qi(v1, v2, ARMOpType!T);
		else
			static assert(0, "Unsupported vector type: " ~ T.stringof);
	}
	else
	{
		static assert(0, "Unsupported on this architecture");
	}
	}
}

// max
template max(SIMDVer Ver = simdVer, T)
{
	@attribute("target", targetNames[Ver])
	T max(inout T v1, inout T v2)
	{
		version(X86_OR_X64)
		{
			version(DigitalMars)
			{
				static if(isOfType!(T, double2))
					return __simd(XMM.MAXPD, v1, v2);
				else static if(isOfType!(T, float4))
					return __simd(XMM.MAXPS, v1, v2);
				else static if(isOfType!(T, long2) || isOfType!(T, ulong2))
					return selectGreater!Ver(v1, v2, v1, v2);
				else static if(isOfType!(T, int4))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __simd(XMM.PMAXSD, v1, v2);
					else
						return selectGreater!Ver(v1, v2, v1, v2);
				}
				else static if(isOfType!(T, uint4))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __simd(XMM.PMAXUD, v1, v2);
					else
						return selectGreater!Ver(v1, v2, v1, v2);
				}
				else static if(isOfType!(T, short8))
					return __simd(XMM.PMAXSW, v1, v2); // available in SSE2
				else static if(isOfType!(T, ushort8))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __simd(XMM.PMAXUW, v1, v2);
					else
						return selectGreater!Ver(v1, v2, v1, v2);
				}
				else static if(isOfType!(T, byte16))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __simd(XMM.PMAXSB, v1, v2);
					else
						return selectGreater!Ver(v1, v2, v1, v2);
				}
				else static if(isOfType!(T, ubyte16))
					return __simd(XMM.PMAXUB, v1, v2); // available in SSE2
				else
					static assert(0, "Unsupported vector type: " ~ T.stringof);
			}
			else version(GNU_OR_LDC)
			{
				static if(isOfType!(T, double2))
					return __builtin_ia32_maxpd(v1, v2);
				else static if(isOfType!(T, float4))
					return __builtin_ia32_maxps(v1, v2);
				else static if(isOfType!(T, long2) || isOfType!(T, ulong2))
					return selectGreater!Ver(v1, v2, v1, v2);
				else static if(isOfType!(T, int4))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __builtin_ia32_pmaxsd128(v1, v2);
					else
						return selectGreater!Ver(v1, v2, v1, v2);
				}
				else static if(isOfType!(T, uint4))
				{
					static if(Ver >= SIMDVer.SSE41)
						return __builtin_ia32_pmaxud128(v1, v2);
					else
						return selectGreater!Ver(v1,
v2, v1, v2); 2432 } 2433 else static if(isOfType!(T, short8)) 2434 return __builtin_ia32_pmaxsw128(v1, v2); // available in SSE2 2435 else static if(isOfType!(T, ushort8)) 2436 { 2437 static if(Ver >= SIMDVer.SSE41) 2438 return __builtin_ia32_pmaxuw128(v1, v2); 2439 else 2440 return selectGreater!Ver(v1, v2, v1, v2); 2441 } 2442 else static if(isOfType!(T, byte16)) 2443 { 2444 static if(Ver >= SIMDVer.SSE41) 2445 return __builtin_ia32_pmaxsb128(v1, v2); 2446 else 2447 return selectGreater!Ver(v1, v2, v1, v2); 2448 } 2449 else static if(isOfType!(T, ubyte16)) 2450 return __builtin_ia32_pmaxub128(v1, v2); // available in SSE2 2451 else 2452 static assert(0, "Unsupported vector type: " ~ T.stringof); 2453 } 2454 } 2455 else version(ARM) 2456 { 2457 static if(isOfType!(T, float4)) 2458 return __builtin_neon_vmaxv4sf(v1, v2, ARMOpType!T); 2459 else static if(is64bitInteger!T) 2460 return __builtin_neon_vmaxv2di(v1, v2, ARMOpType!T); 2461 else static if(is32bitElement!T) 2462 return __builtin_neon_vmaxv4si(v1, v2, ARMOpType!T); 2463 else static if(is16bitElement!T) 2464 return __builtin_neon_vmaxv8hi(v1, v2, ARMOpType!T); 2465 else static if(is8bitElement!T) 2466 return __builtin_neon_vmaxv16qi(v1, v2, ARMOpType!T); 2467 else 2468 static assert(0, "Unsupported vector type: " ~ T.stringof); 2469 } 2470 else 2471 { 2472 static assert(0, "Unsupported on this architecture"); 2473 } 2474 } 2475 } 2476 2477 // clamp values such that a <= v <= b 2478 template clamp(SIMDVer Ver = simdVer, T) 2479 { 2480 @attribute("target", targetNames[Ver]) 2481 T clamp(inout T a, inout T v, inout T b) 2482 { 2483 return max!Ver(a, min!Ver(v, b)); 2484 } 2485 } 2486 2487 // lerp 2488 template lerp(SIMDVer Ver = simdVer, T) 2489 { 2490 @attribute("target", targetNames[Ver]) 2491 T lerp(inout T a, inout T b, inout T t) 2492 { 2493 return madd!Ver(sub!Ver(b, a), t, a); 2494 } 2495 } 2496 2497 2498 /////////////////////////////////////////////////////////////////////////////// 2499 // Floating point operations 2500 2501 // round to the next lower integer value 2502 T floor(SIMDVer Ver = simdVer, T)(inout T v) 2503 { 2504 version(X86_OR_X64) 2505 { 2506 version(DigitalMars) 2507 { 2508 static assert(0, "WAITING FOR DMD"); 2509 } 2510 else version(GNU_OR_LDC) 2511 { 2512 static if(isOfType!(T, double2)) 2513 { 2514 static if(Ver >= SIMDVer.SSE41) 2515 return __builtin_ia32_roundpd(v, 1); 2516 else 2517 static assert(0, "Only supported in SSE4.1 and above"); 2518 } 2519 else static if(isOfType!(T, float4)) 2520 { 2521 static if(Ver >= SIMDVer.SSE41) 2522 return __builtin_ia32_roundps(v, 1); 2523 else 2524 static assert(0, "Only supported in SSE4.1 and above"); 2525 } 2526 else 2527 { 2528 static assert(0, "Unsupported vector type: " ~ T.stringof); 2529 /* 2530 static const vFloat twoTo23 = (vFloat){ 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f }; 2531 vFloat b = (vFloat) _mm_srli_epi32( _mm_slli_epi32( (vUInt32) v, 1 ), 1 ); //fabs(v) 2532 vFloat d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v, twoTo23 ), twoTo23 ), twoTo23 ), twoTo23 ); //the meat of floor 2533 vFloat largeMaskE = (vFloat) _mm_cmpgt_ps( b, twoTo23 ); //-1 if v >= 2**23 2534 vFloat g = (vFloat) _mm_cmplt_ps( v, d ); //check for possible off by one error 2535 vFloat h = _mm_cvtepi32_ps( (vUInt32) g ); //convert positive check result to -1.0, negative to 0.0 2536 vFloat t = _mm_add_ps( d, h ); //add in the error if there is one 2537 2538 //Select between output result and input value based on v >= 2**23 2539 v = _mm_and_ps( v, largeMaskE ); 2540 t = 
_mm_andnot_ps( largeMaskE, t ); 2541 2542 return _mm_or_ps( t, v ); 2543 */ 2544 } 2545 } 2546 } 2547 else version(ARM) 2548 { 2549 static assert(0, "TODO"); 2550 } 2551 else 2552 { 2553 static assert(0, "Unsupported on this architecture"); 2554 } 2555 } 2556 2557 // round to the next higher integer value 2558 T ceil(SIMDVer Ver = simdVer, T)(inout T v) 2559 { 2560 version(X86_OR_X64) 2561 { 2562 version(DigitalMars) 2563 { 2564 static assert(0, "WAITING FOR DMD"); 2565 } 2566 else version(GNU_OR_LDC) 2567 { 2568 static if(isOfType!(T, double2)) 2569 { 2570 static if(Ver >= SIMDVer.SSE41) 2571 return __builtin_ia32_roundpd(v, 2); 2572 else 2573 static assert(0, "Only supported in SSE4.1 and above"); 2574 } 2575 else static if(isOfType!(T, float4)) 2576 { 2577 static if(Ver >= SIMDVer.SSE41) 2578 return __builtin_ia32_roundps(v, 2); 2579 else 2580 static assert(0, "Only supported in SSE4.1 and above"); 2581 } 2582 else 2583 static assert(0, "Unsupported vector type: " ~ T.stringof); 2584 } 2585 } 2586 else version(ARM) 2587 { 2588 static assert(0, "TODO"); 2589 } 2590 else 2591 { 2592 static assert(0, "Unsupported on this architecture"); 2593 } 2594 } 2595 2596 // round to the nearest integer value 2597 T round(SIMDVer Ver = simdVer, T)(inout T v) 2598 { 2599 version(X86_OR_X64) 2600 { 2601 version(DigitalMars) 2602 { 2603 static assert(0, "WAITING FOR DMD"); 2604 } 2605 else version(GNU_OR_LDC) 2606 { 2607 static if(isOfType!(T, double2)) 2608 { 2609 static if(Ver >= SIMDVer.SSE41) 2610 return __builtin_ia32_roundpd(v, 0); 2611 else 2612 static assert(0, "Only supported in SSE4.1 and above"); 2613 } 2614 else static if(isOfType!(T, float4)) 2615 { 2616 static if(Ver >= SIMDVer.SSE41) 2617 return __builtin_ia32_roundps(v, 0); 2618 else 2619 static assert(0, "Only supported in SSE4.1 and above"); 2620 } 2621 else 2622 static assert(0, "Unsupported vector type: " ~ T.stringof); 2623 } 2624 } 2625 else version(ARM) 2626 { 2627 static assert(0, "TODO"); 2628 } 2629 else 2630 { 2631 static assert(0, "Unsupported on this architecture"); 2632 } 2633 } 2634 2635 // truncate fraction (round towards zero) 2636 T trunc(SIMDVer Ver = simdVer, T)(inout T v) 2637 { 2638 version(X86_OR_X64) 2639 { 2640 version(DigitalMars) 2641 { 2642 static assert(0, "WAITING FOR DMD"); 2643 } 2644 else version(GNU_OR_LDC) 2645 { 2646 static if(isOfType!(T, double2)) 2647 { 2648 static if(Ver >= SIMDVer.SSE41) 2649 return __builtin_ia32_roundpd(v, 3); 2650 else 2651 static assert(0, "Only supported in SSE4.1 and above"); 2652 } 2653 else static if(isOfType!(T, float4)) 2654 { 2655 static if(Ver >= SIMDVer.SSE41) 2656 return __builtin_ia32_roundps(v, 3); 2657 else 2658 static assert(0, "Only supported in SSE4.1 and above"); 2659 } 2660 else 2661 { 2662 static assert(0, "Unsupported vector type: " ~ T.stringof); 2663 /* 2664 static const vFloat twoTo23 = (vFloat){ 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f }; 2665 vFloat b = (vFloat) _mm_srli_epi32( _mm_slli_epi32( (vUInt32) v, 1 ), 1 ); //fabs(v) 2666 vFloat d = _mm_sub_ps( _mm_add_ps( b, twoTo23 ), twoTo23 ); //the meat of floor 2667 vFloat largeMaskE = (vFloat) _mm_cmpgt_ps( b, twoTo23 ); //-1 if v >= 2**23 2668 vFloat g = (vFloat) _mm_cmplt_ps( b, d ); //check for possible off by one error 2669 vFloat h = _mm_cvtepi32_ps( (vUInt32) g ); //convert positive check result to -1.0, negative to 0.0 2670 vFloat t = _mm_add_ps( d, h ); //add in the error if there is one 2671 2672 //put the sign bit back 2673 vFloat sign = (vFloat) _mm_slli_epi31( _mm_srli128( (vUInt32) v, 31), 
31 ); 2674 t = _mm_or_ps( t, sign ); 2675 2676 //Select between output result and input value based on fabs(v) >= 2**23 2677 v = _mm_and_ps( v, largeMaskE ); 2678 t = _mm_andnot_ps( largeMaskE, t ); 2679 2680 return _mm_or_ps( t, v ); 2681 */ 2682 } 2683 } 2684 } 2685 else version(ARM) 2686 { 2687 static assert(0, "TODO"); 2688 } 2689 else 2690 { 2691 static assert(0, "Unsupported on this architecture"); 2692 } 2693 } 2694 2695 /////////////////////////////////////////////////////////////////////////////// 2696 // Precise mathematical operations 2697 2698 // divide 2699 T div(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 2700 { 2701 version(X86_OR_X64) 2702 { 2703 return v1 / v2; 2704 } 2705 else version(ARM) 2706 { 2707 return mul!Ver(v1, rcp!Ver(v2)); 2708 } 2709 else 2710 { 2711 static assert(0, "Unsupported on this architecture"); 2712 } 2713 } 2714 2715 // reciprocal 2716 T rcp(SIMDVer Ver = simdVer, T)(inout T v) 2717 { 2718 version(X86_OR_X64) 2719 { 2720 version(DigitalMars) 2721 { 2722 static if(isOfType!(T, double2)) 2723 return div!(Ver, T)(1.0, v); 2724 else static if(isOfType!(T, float4)) 2725 return div!(Ver, T)(1.0f, v); 2726 else 2727 static assert(0, "Unsupported vector type: " ~ T.stringof); 2728 } 2729 else version(GNU_OR_LDC) 2730 { 2731 static if(isOfType!(T, double2) || isOfType!(T, float4)) 2732 { 2733 T one = 1; 2734 return div!Ver(one, v); 2735 } 2736 else 2737 static assert(0, "Unsupported vector type: " ~ T.stringof); 2738 } 2739 } 2740 else version(ARM) 2741 { 2742 static assert(0, "TODO!"); 2743 static if(isOfType!(T, float4)) 2744 return null; 2745 else 2746 static assert(0, "Unsupported vector type: " ~ T.stringof); 2747 } 2748 else 2749 { 2750 static assert(0, "Unsupported on this architecture"); 2751 } 2752 } 2753 2754 // square root 2755 T sqrt(SIMDVer Ver = simdVer, T)(inout T v) 2756 { 2757 version(X86_OR_X64) 2758 { 2759 version(DigitalMars) 2760 { 2761 static if(isOfType!(T, double2)) 2762 return __simd(XMM.SQRTPD, v); 2763 else static if(isOfType!(T, float4)) 2764 return __simd(XMM.SQRTPS, v); 2765 else 2766 static assert(0, "Unsupported vector type: " ~ T.stringof); 2767 } 2768 else version(GNU_OR_LDC) 2769 { 2770 static if(isOfType!(T, double2)) 2771 return __builtin_ia32_sqrtpd(v); 2772 else static if(isOfType!(T, float4)) 2773 return __builtin_ia32_sqrtps(v); 2774 else 2775 static assert(0, "Unsupported vector type: " ~ T.stringof); 2776 } 2777 } 2778 else version(ARM) 2779 { 2780 static assert(0, "TODO!"); 2781 static if(isOfType!(T, float4)) 2782 return null; 2783 else 2784 static assert(0, "Unsupported vector type: " ~ T.stringof); 2785 } 2786 else 2787 { 2788 static assert(0, "Unsupported on this architecture"); 2789 } 2790 } 2791 2792 // reciprocal square root 2793 T rsqrt(SIMDVer Ver = simdVer, T)(inout T v) 2794 { 2795 version(X86_OR_X64) 2796 { 2797 version(DigitalMars) 2798 { 2799 static if(isOfType!(T, double2) || isOfType!(T, float4)) 2800 return rcp!Ver(sqrt!Ver(v)); 2801 else 2802 static assert(0, "Unsupported vector type: " ~ T.stringof); 2803 } 2804 else version(GNU_OR_LDC) 2805 { 2806 static if(isOfType!(T, double2) || isOfType!(T, float4)) 2807 return rcp!Ver(sqrt!Ver(v)); 2808 else 2809 static assert(0, "Unsupported vector type: " ~ T.stringof); 2810 } 2811 } 2812 else version(ARM) 2813 { 2814 static assert(0, "TODO!"); 2815 static if(isOfType!(T, float4)) 2816 return null; 2817 else 2818 static assert(0, "Unsupported vector type: " ~ T.stringof); 2819 } 2820 else 2821 { 2822 static assert(0, "Unsupported on this 
architecture"); 2823 } 2824 } 2825 2826 2827 /////////////////////////////////////////////////////////////////////////////// 2828 // Vector maths operations 2829 2830 // 2d dot product 2831 T dot2(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 2832 { 2833 version(X86_OR_X64) 2834 { 2835 version(DigitalMars) 2836 { 2837 static if(isOfType!(T, double2)) 2838 { 2839 static if(Ver >= SIMDVer.SSE41) // 1 op 2840 return __simd(XMM.DPPD, v1, v2, 0x33); 2841 else static if(Ver >= SIMDVer.SSE3) // 2 ops 2842 { 2843 double2 t = v1 * v2; 2844 return __simd(XMM.HADDPD, t, t); 2845 } 2846 else // 5 ops 2847 { 2848 double2 t = v1 * v2; 2849 return getX!Ver(t) + getY!Ver(t); 2850 } 2851 } 2852 else static if(isOfType!(T, float4)) 2853 { 2854 static if(Ver >= SIMDVer.SSE41) // 1 op 2855 return __simd(XMM.DPPS, v1, v2, 0x3F); 2856 else static if(Ver >= SIMDVer.SSE3) // 3 ops 2857 { 2858 float4 t = v1 * v2; 2859 t = __simd(XMM.HADDPS, t, t); 2860 return swizzle!("XXZZ", Ver)(t); 2861 } 2862 else // 5 ops 2863 { 2864 float4 t = v1 * v2; 2865 return getX!Ver(t) + getY!Ver(t); 2866 } 2867 } 2868 else 2869 static assert(0, "Unsupported vector type: " ~ T.stringof); 2870 } 2871 else version(GNU_OR_LDC) 2872 { 2873 static if(isOfType!(T, double2)) 2874 { 2875 static if(Ver >= SIMDVer.SSE41) // 1 op 2876 return __builtin_ia32_dppd(v1, v2, 0x33); 2877 else static if(Ver >= SIMDVer.SSE3) // 2 ops 2878 { 2879 double2 t = v1 * v2; 2880 return __builtin_ia32_haddpd(t, t); 2881 } 2882 else // 5 ops 2883 { 2884 double2 t = v1 * v2; 2885 return getX!Ver(t) + getY!Ver(t); 2886 } 2887 } 2888 else static if(isOfType!(T, float4)) 2889 { 2890 static if(Ver >= SIMDVer.SSE41) // 1 op 2891 return __builtin_ia32_dpps(v1, v2, 0x3F); 2892 else static if(Ver >= SIMDVer.SSE3) // 3 ops 2893 { 2894 float4 t = v1 * v2; 2895 t = __builtin_ia32_haddps(t, t); 2896 return swizzle!("XXZZ", Ver)(t); 2897 } 2898 else // 5 ops 2899 { 2900 float4 t = v1 * v2; 2901 return getX!Ver(t) + getY!Ver(t); 2902 } 2903 } 2904 else 2905 static assert(0, "Unsupported vector type: " ~ T.stringof); 2906 } 2907 } 2908 else version(ARM) 2909 { 2910 static assert(0, "TODO"); 2911 } 2912 else 2913 { 2914 static assert(0, "Unsupported on this architecture"); 2915 } 2916 } 2917 2918 // 3d dot product 2919 T dot3(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 2920 { 2921 version(X86_OR_X64) 2922 { 2923 version(DigitalMars) 2924 { 2925 static if(isOfType!(T, float4)) 2926 { 2927 static if(Ver >= SIMDVer.SSE41) // 1 op 2928 return __simd(XMM.DPPS, v1, v2, 0x7F); 2929 else static if(Ver >= SIMDVer.SSE3) // 4 ops 2930 { 2931 float4 t = shiftElementsRight!(1, Ver)(v1 * v2); 2932 t = __simd(XMM.HADDPS, t, t); 2933 return __simd(XMM.HADDPS, t, t); 2934 } 2935 else // 8 ops!... surely we can do better than this? 2936 { 2937 float4 t = shiftElementsRight!(1, Ver)(v1 * v2); 2938 t = t + swizzle!("yxwz", Ver)(t); 2939 return t + swizzle!("zzxx", Ver)(t); 2940 } 2941 } 2942 else 2943 static assert(0, "Unsupported vector type: " ~ T.stringof); 2944 } 2945 else version(GNU_OR_LDC) 2946 { 2947 static if(isOfType!(T, float4)) 2948 { 2949 static if(Ver >= SIMDVer.SSE41) // 1 op 2950 return __builtin_ia32_dpps(v1, v2, 0x7F); 2951 else static if(Ver >= SIMDVer.SSE3) // 4 ops 2952 { 2953 float4 t = shiftElementsRight!(1, Ver)(v1 * v2); 2954 t = __builtin_ia32_haddps(t, t); 2955 return __builtin_ia32_haddps(t, t); 2956 } 2957 else // 8 ops!... surely we can do better than this? 
2958 { 2959 float4 t = shiftElementsRight!(1, Ver)(v1 * v2); 2960 t = t + swizzle!("yxwz", Ver)(t); 2961 return t + swizzle!("zzxx", Ver)(t); 2962 } 2963 } 2964 else 2965 static assert(0, "Unsupported vector type: " ~ T.stringof); 2966 } 2967 } 2968 else version(ARM) 2969 { 2970 static assert(0, "TODO"); 2971 } 2972 else 2973 { 2974 static assert(0, "Unsupported on this architecture"); 2975 } 2976 } 2977 2978 // 4d dot product 2979 T dot4(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 2980 { 2981 version(X86_OR_X64) 2982 { 2983 version(DigitalMars) 2984 { 2985 static if(isOfType!(T, float4)) 2986 { 2987 static if(Ver >= SIMDVer.SSE41) // 1 op 2988 return __simd(XMM.DPPS, v1, v2, 0xFF); 2989 else static if(Ver >= SIMDVer.SSE3) // 3 ops 2990 { 2991 float4 t = v1 * v2; 2992 t = __simd(XMM.HADDPS, t, t); 2993 return __simd(XMM.HADDPS, t, t); 2994 } 2995 else // 7 ops!... surely we can do better than this? 2996 { 2997 float4 t = v1 * v2; 2998 t = t + swizzle!("yxwz", Ver)(t); 2999 return t + swizzle!("zzxx", Ver)(t); 3000 } 3001 } 3002 else 3003 static assert(0, "Unsupported vector type: " ~ T.stringof); 3004 } 3005 else version(GNU_OR_LDC) 3006 { 3007 static if(isOfType!(T, float4)) 3008 { 3009 static if(Ver >= SIMDVer.SSE41) // 1 op 3010 return __builtin_ia32_dpps(v1, v2, 0xFF); 3011 else static if(Ver >= SIMDVer.SSE3) // 3 ops 3012 { 3013 float4 t = v1 * v2; 3014 t = __builtin_ia32_haddps(t, t); 3015 return __builtin_ia32_haddps(t, t); 3016 } 3017 else // 7 ops!... surely we can do better than this? 3018 { 3019 float4 t = v1 * v2; 3020 t = t + swizzle!("yxwz", Ver)(t); 3021 return t + swizzle!("zzxx", Ver)(t); 3022 } 3023 } 3024 else 3025 static assert(0, "Unsupported vector type: " ~ T.stringof); 3026 } 3027 } 3028 else version(ARM) 3029 { 3030 static assert(0, "TODO"); 3031 } 3032 else 3033 { 3034 static assert(0, "Unsupported on this architecture"); 3035 } 3036 } 3037 3038 // homogeneous dot product: v1.xyz1 dot v2.xyzw 3039 T dotH(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3040 { 3041 return null; 3042 } 3043 3044 // 3d cross product 3045 T cross3(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3046 { 3047 T left = mul!Ver(swizzle!("YZXW", Ver)(v1), swizzle!("ZXYW", Ver)(v2)); 3048 T right = mul!Ver(swizzle!("ZXYW", Ver)(v1), swizzle!("YZXW", Ver)(v2)); 3049 return sub!Ver(left, right); 3050 } 3051 3052 // 3d magnitude 3053 T magnitude3(SIMDVer Ver = simdVer, T)(inout T v) 3054 { 3055 return sqrt!Ver(dot3!Ver(v, v)); 3056 } 3057 3058 // 4d magnitude 3059 T magnitude4(SIMDVer Ver = simdVer, T)(inout T v) 3060 { 3061 return sqrt!Ver(dot4!Ver(v, v)); 3062 } 3063 3064 // 3d normalise 3065 T normalise3(SIMDVer Ver = simdVer, T)(inout T v) 3066 { 3067 return div!Ver(v, magnitude3!Ver(v)); 3068 } 3069 3070 // 4d normalise 3071 T normalise4(SIMDVer Ver = simdVer, T)(inout T v) 3072 { 3073 return div!Ver(v, magnitude4!Ver(v)); 3074 } 3075 3076 // 3d magnitude squared 3077 T magSq3(SIMDVer Ver = simdVer, T)(inout T v) 3078 { 3079 return dot3!Ver(v, v); 3080 } 3081 3082 // 4d magnitude squared 3083 T magSq4(SIMDVer Ver = simdVer, T)(inout T v) 3084 { 3085 return dot4!Ver(v, v); 3086 } 3087 3088 3089 /////////////////////////////////////////////////////////////////////////////// 3090 // Fast estimates 3091 3092 // divide estimate 3093 T divEst(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3094 { 3095 version(X86_OR_X64) 3096 { 3097 static if(isOfType!(T, double2)) 3098 return div!Ver(v1, v2); 3099 else static if(isOfType!(T, float4)) 3100 return mul!Ver(v1, rcpEst!Ver(v2)); 3101 else 
3102 static assert(0, "Unsupported vector type: " ~ T.stringof); 3103 } 3104 else version(ARM) 3105 { 3106 return mul!Ver(v1, rcpEst!Ver(v2)); 3107 } 3108 else 3109 { 3110 return div!Ver(v1, v2); 3111 } 3112 } 3113 3114 // reciprocal estimate 3115 T rcpEst(SIMDVer Ver = simdVer, T)(inout T v) 3116 { 3117 version(X86_OR_X64) 3118 { 3119 version(DigitalMars) 3120 { 3121 static if(isOfType!(T, double2)) 3122 return div!(Ver, T)(1.0, v); 3123 else static if(isOfType!(T, float4)) 3124 return __simd(XMM.RCPPS, v); 3125 else 3126 static assert(0, "Unsupported vector type: " ~ T.stringof); 3127 } 3128 else version(GNU_OR_LDC) 3129 { 3130 static if(isOfType!(T, double2)) 3131 { 3132 T one = 1; 3133 return div!Ver(one, v); 3134 } 3135 else static if(isOfType!(T, float4)) 3136 return __builtin_ia32_rcpps(v); 3137 else 3138 static assert(0, "Unsupported vector type: " ~ T.stringof); 3139 } 3140 } 3141 else version(ARM) 3142 { 3143 static if(isOfType!(T, float4)) 3144 return __builtin_neon_vrecpev4sf(v, ARMOpType!T); 3145 else 3146 static assert(0, "Unsupported vector type: " ~ T.stringof); 3147 } 3148 else 3149 { 3150 return rcp!Ver(v); 3151 } 3152 } 3153 3154 // square root estimate 3155 T sqrtEst(SIMDVer Ver = simdVer, T)(inout T v) 3156 { 3157 version(ARM) 3158 { 3159 static assert(0, "TODO: I'm sure ARM has a good estimate for this..."); 3160 } 3161 else 3162 { 3163 return sqrt!Ver(v); 3164 } 3165 } 3166 3167 // reciprocal square root estimate 3168 T rsqrtEst(SIMDVer Ver = simdVer, T)(inout T v) 3169 { 3170 version(X86_OR_X64) 3171 { 3172 version(DigitalMars) 3173 { 3174 static if(isOfType!(T, double2)) 3175 return rcpEst!Ver(sqrtEst!Ver(v)); 3176 else static if(isOfType!(T, float4)) 3177 return __simd(XMM.RSQRTPS, v); 3178 else 3179 static assert(0, "Unsupported vector type: " ~ T.stringof); 3180 } 3181 else version(GNU_OR_LDC) 3182 { 3183 static if(isOfType!(T, double2)) 3184 return rcpEst!Ver(sqrtEst!Ver(v)); 3185 else static if(isOfType!(T, float4)) 3186 return __builtin_ia32_rsqrtps(v); 3187 else 3188 static assert(0, "Unsupported vector type: " ~ T.stringof); 3189 } 3190 } 3191 else version(ARM) 3192 { 3193 static if(isOfType!(T, float4)) 3194 return __builtin_neon_vrsqrtev4sf(v, ARMOpType!T); 3195 else 3196 static assert(0, "Unsupported vector type: " ~ T.stringof); 3197 } 3198 else 3199 { 3200 return rsqrt!Ver(v); 3201 } 3202 } 3203 3204 // 3d magnitude estimate 3205 T magEst3(SIMDVer Ver = simdVer, T)(inout T v) 3206 { 3207 return sqrtEst!Ver(dot3!Ver(v, v)); 3208 } 3209 3210 // 4d magnitude estimate 3211 T magEst4(SIMDVer Ver = simdVer, T)(inout T v) 3212 { 3213 return sqrtEst!Ver(dot4!Ver(v, v)); 3214 } 3215 3216 // 3d normalise estimate 3217 T normEst3(SIMDVer Ver = simdVer, T)(inout T v) 3218 { 3219 return mul!Ver(v, rsqrtEst!Ver(dot3!Ver(v, v))); 3220 } 3221 3222 // 4d normalise estimate 3223 T normEst4(SIMDVer Ver = simdVer, T)(inout T v) 3224 { 3225 return mul!Ver(v, rsqrtEst!Ver(dot4!Ver(v, v))); 3226 } 3227 3228 3229 /////////////////////////////////////////////////////////////////////////////// 3230 // Bitwise operations 3231 3232 // unary complement: ~v 3233 T comp(SIMDVer Ver = simdVer, T)(inout T v) 3234 { 3235 version(X86_OR_X64) 3236 { 3237 return cast(T) ~ cast(int4) v; 3238 } 3239 else version(ARM) 3240 { 3241 static assert(0, "TODO"); 3242 } 3243 else 3244 { 3245 static assert(0, "Unsupported on this architecture"); 3246 } 3247 } 3248 3249 // bitwise or: v1 | v2 3250 T or(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3251 { 3252 version(X86_OR_X64) 3253 { 3254 
version(DigitalMars) 3255 { 3256 static if(isOfType!(T, double2)) 3257 return __simd(XMM.ORPD, v1, v2); 3258 else static if(isOfType!(T, float4)) 3259 return __simd(XMM.ORPS, v1, v2); 3260 else 3261 return __simd(XMM.POR, v1, v2); 3262 } 3263 else version(GNU) 3264 { 3265 static if(isOfType!(T, double2)) 3266 return __builtin_ia32_orpd(v1, v2); 3267 else static if(isOfType!(T, float4)) 3268 return __builtin_ia32_orps(v1, v2); 3269 else 3270 return __builtin_ia32_por128(v1, v2); 3271 } 3272 else version(LDC) 3273 { 3274 return cast(T) (cast(int4) v1 | cast(int4) v2); 3275 } 3276 } 3277 else version(ARM) 3278 { 3279 static assert(0, "TODO"); 3280 } 3281 else 3282 { 3283 static assert(0, "Unsupported on this architecture"); 3284 } 3285 } 3286 3287 // bitwise nor: ~(v1 | v2) 3288 T nor(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3289 { 3290 return comp!Ver(or!Ver(v1, v2)); 3291 } 3292 3293 // bitwise and: v1 & v2 3294 T and(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3295 { 3296 version(X86_OR_X64) 3297 { 3298 version(DigitalMars) 3299 { 3300 static if(isOfType!(T, double2)) 3301 return __simd(XMM.ANDPD, v1, v2); 3302 else static if(isOfType!(T, float4)) 3303 return __simd(XMM.ANDPS, v1, v2); 3304 else 3305 return __simd(XMM.PAND, v1, v2); 3306 } 3307 else version(GNU) 3308 { 3309 static if(isOfType!(T, double2)) 3310 return __builtin_ia32_andpd(v1, v2); 3311 else static if(isOfType!(T, float4)) 3312 return __builtin_ia32_andps(v1, v2); 3313 else 3314 return __builtin_ia32_pand128(v1, v2); 3315 } 3316 else version(LDC) 3317 { 3318 return cast(T)(cast(int4) v1 & cast(int4) v2); 3319 } 3320 } 3321 else version(ARM) 3322 { 3323 static assert(0, "TODO"); 3324 } 3325 else 3326 { 3327 static assert(0, "Unsupported on this architecture"); 3328 } 3329 } 3330 3331 // bitwise nand: ~(v1 & v2) 3332 T nand(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3333 { 3334 return comp!Ver(and!Ver(v1, v2)); 3335 } 3336 3337 // bitwise and with not: v1 & ~v2 3338 T andNot(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3339 { 3340 version(X86_OR_X64) 3341 { 3342 version(DigitalMars) 3343 { 3344 static if(isOfType!(T, double2)) 3345 return __simd(XMM.ANDNPD, v2, v1); 3346 else static if(isOfType!(T, float4)) 3347 return __simd(XMM.ANDNPS, v2, v1); 3348 else 3349 return __simd(XMM.PANDN, v2, v1); 3350 } 3351 else version(GNU) 3352 { 3353 static if(isOfType!(T, double2)) 3354 return __builtin_ia32_andnpd(v2, v1); 3355 else static if(isOfType!(T, float4)) 3356 return __builtin_ia32_andnps(v2, v1); 3357 else 3358 return __builtin_ia32_pandn128(v2, v1); 3359 } 3360 else version(LDC) 3361 { 3362 return cast(T)(cast(int4) v1 & ~cast(int4) v2); 3363 } 3364 } 3365 else version(ARM) 3366 { 3367 static assert(0, "TODO"); 3368 } 3369 else 3370 { 3371 static assert(0, "Unsupported on this architecture"); 3372 } 3373 } 3374 3375 // bitwise xor: v1 ^ v2 3376 T xor(SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3377 { 3378 version(X86_OR_X64) 3379 { 3380 version(DigitalMars) 3381 { 3382 static if(isOfType!(T, double2)) 3383 return __simd(XMM.XORPD, v1, v2); 3384 else static if(isOfType!(T, float4)) 3385 return __simd(XMM.XORPS, v1, v2); 3386 else 3387 return __simd(XMM.PXOR, v1, v2); 3388 } 3389 else version(GNU) 3390 { 3391 static if(isOfType!(T, double2)) 3392 return __builtin_ia32_xorpd(v1, v2); 3393 else static if(isOfType!(T, float4)) 3394 return __builtin_ia32_xorps(v1, v2); 3395 else 3396 return __builtin_ia32_pxor128(v1, v2); 3397 } 3398 else version(LDC) 3399 { 3400 return cast(T) (cast(int4) v1 ^ 
cast(int4) v2); 3401 } 3402 } 3403 else version(ARM) 3404 { 3405 static assert(0, "TODO"); 3406 } 3407 else 3408 { 3409 static assert(0, "Unsupported on this architecture"); 3410 } 3411 } 3412 3413 3414 /////////////////////////////////////////////////////////////////////////////// 3415 // Bit shifts and rotates 3416 3417 // binary shift left 3418 T shiftLeft(SIMDVer Ver = simdVer, T)(inout T v, inout T bits) 3419 { 3420 version(X86_OR_X64) 3421 { 3422 version(DigitalMars) 3423 { 3424 static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3425 return __simd(XMM.PSLLQ, v, bits); 3426 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3427 return __simd(XMM.PSLLD, v, bits); 3428 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3429 return __simd(XMM.PSLLW, v, bits); 3430 else 3431 static assert(0, "Unsupported vector type: " ~ T.stringof); 3432 } 3433 else version(GNU_OR_LDC) 3434 { 3435 static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3436 return __builtin_ia32_psllq128(v, bits); 3437 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3438 return __builtin_ia32_pslld128(v, bits); 3439 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3440 return __builtin_ia32_psllw128(v, bits); 3441 else 3442 static assert(0, "Unsupported vector type: " ~ T.stringof); 3443 } 3444 } 3445 else version(ARM) 3446 { 3447 static assert(0, "TODO"); 3448 } 3449 else 3450 { 3451 static assert(0, "Unsupported on this architecture"); 3452 } 3453 } 3454 3455 // binary shift left by immediate 3456 T shiftLeftImmediate(size_t bits, SIMDVer Ver = simdVer, T)(inout T v) 3457 { 3458 static if(bits == 0) // shift by 0 is a no-op 3459 return v; 3460 else 3461 { 3462 version(X86_OR_X64) 3463 { 3464 version(DigitalMars) 3465 { 3466 static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3467 return __simd_ib(XMM.PSLLQ, v, bits); 3468 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3469 return __simd_ib(XMM.PSLLD, v, bits); 3470 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3471 return __simd_ib(XMM.PSLLW, v, bits); 3472 else 3473 static assert(0, "Unsupported vector type: " ~ T.stringof); 3474 } 3475 else version(GNU_OR_LDC) 3476 { 3477 static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3478 return __builtin_ia32_psllqi128(v, bits); 3479 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3480 return __builtin_ia32_pslldi128(v, bits); 3481 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3482 return __builtin_ia32_psllwi128(v, bits); 3483 else 3484 static assert(0, "Unsupported vector type: " ~ T.stringof); 3485 } 3486 } 3487 else version(ARM) 3488 { 3489 static assert(0, "TODO"); 3490 } 3491 else 3492 { 3493 static assert(0, "Unsupported on this architecture"); 3494 } 3495 } 3496 } 3497 3498 // binary shift right (signed types perform arithmatic shift right) 3499 T shiftRight(SIMDVer Ver = simdVer, T)(inout T v, inout T bits) 3500 { 3501 version(X86_OR_X64) 3502 { 3503 version(DigitalMars) 3504 { 3505 static if(isOfType!(T, ulong2)) 3506 return __simd(XMM.PSRLQ, v, bits); 3507 else static if(isOfType!(T, int4)) 3508 return __simd(XMM.PSRAD, v, bits); 3509 else static if(isOfType!(T, uint4)) 3510 return __simd(XMM.PSRLD, v, bits); 3511 else static if(isOfType!(T, short8)) 3512 return __simd(XMM.PSRAW, v, bits); 3513 else static if(isOfType!(T, ushort8)) 3514 return __simd(XMM.PSRLW, v, bits); 3515 else 3516 static assert(0, "Unsupported vector type: " ~ T.stringof); 3517 } 3518 else version(GNU_OR_LDC) 3519 { 3520 static 
if(isOfType!(T, ulong2)) 3521 return __builtin_ia32_psrlq128(v, bits); 3522 else static if(isOfType!(T, int4)) 3523 return __builtin_ia32_psrad128(v, bits); 3524 else static if(isOfType!(T, uint4)) 3525 return __builtin_ia32_psrld128(v, bits); 3526 else static if(isOfType!(T, short8)) 3527 return __builtin_ia32_psraw128(v, bits); 3528 else static if(isOfType!(T, ushort8)) 3529 return __builtin_ia32_psrlw128(v, bits); 3530 else 3531 static assert(0, "Unsupported vector type: " ~ T.stringof); 3532 } 3533 } 3534 else version(ARM) 3535 { 3536 static assert(0, "TODO"); 3537 } 3538 else 3539 { 3540 static assert(0, "Unsupported on this architecture"); 3541 } 3542 } 3543 3544 // binary shift right by immediate (signed types perform arithmatic shift right) 3545 T shiftRightImmediate(size_t bits, SIMDVer Ver = simdVer, T)(inout T v) 3546 { 3547 static if(bits == 0) // shift by 0 is a no-op 3548 return v; 3549 else 3550 { 3551 version(X86_OR_X64) 3552 { 3553 version(DigitalMars) 3554 { 3555 static if(isOfType!(T, ulong2)) 3556 return __simd_ib(XMM.PSRLQ, v, bits); 3557 else static if(isOfType!(T, int4)) 3558 return __simd_ib(XMM.PSRAD, v, bits); 3559 else static if(isOfType!(T, uint4)) 3560 return __simd_ib(XMM.PSRLD, v, bits); 3561 else static if(isOfType!(T, short8)) 3562 return __simd_ib(XMM.PSRAW, v, bits); 3563 else static if(isOfType!(T, ushort8)) 3564 return __simd_ib(XMM.PSRLW, v, bits); 3565 else 3566 static assert(0, "Unsupported vector type: " ~ T.stringof); 3567 } 3568 else version(GNU_OR_LDC) 3569 { 3570 static if(isOfType!(T, ulong2)) 3571 return __builtin_ia32_psrlqi128(v, bits); 3572 else static if(isOfType!(T, int4)) 3573 return __builtin_ia32_psradi128(v, bits); 3574 else static if(isOfType!(T, uint4)) 3575 return __builtin_ia32_psrldi128(v, bits); 3576 else static if(isOfType!(T, short8)) 3577 return __builtin_ia32_psrawi128(v, bits); 3578 else static if(isOfType!(T, ushort8)) 3579 return __builtin_ia32_psrlwi128(v, bits); 3580 else 3581 static assert(0, "Unsupported vector type: " ~ T.stringof); 3582 } 3583 } 3584 else version(ARM) 3585 { 3586 static assert(0, "TODO"); 3587 } 3588 else 3589 { 3590 static assert(0, "Unsupported on this architecture"); 3591 } 3592 } 3593 } 3594 3595 // shift bytes left by immediate ('left' as they appear in memory) 3596 T shiftBytesLeftImmediate(size_t bytes, SIMDVer Ver = simdVer, T)(inout T v) 3597 { 3598 static assert(bytes >= 0 && bytes < 16, "Invalid shift amount"); 3599 static if(bytes == 0) // shift by 0 is a no-op 3600 return v; 3601 else 3602 { 3603 version(X86_OR_X64) 3604 { 3605 version(DigitalMars) 3606 { 3607 // little endian reads the bytes into the register in reverse, so we need to flip the operations 3608 return __simd_ib(XMM.PSRLDQ, v, bytes); 3609 } 3610 else version(GNU_OR_LDC) 3611 { 3612 // little endian reads the bytes into the register in reverse, so we need to flip the operations 3613 return cast(T) __builtin_ia32_psrldqi128(cast(ubyte16) v, bytes * 8); // TODO: *8? WAT? 
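			// (the GCC/LDC builtin takes its shift amount in bits, not bytes — emmintrin.h defines
			// _mm_srli_si128(a, n) as __builtin_ia32_psrldqi128(a, (n) * 8) — hence the * 8 above)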
3614 } 3615 } 3616 else version(ARM) 3617 { 3618 static assert(0, "TODO"); 3619 } 3620 else 3621 { 3622 static assert(0, "Unsupported on this architecture"); 3623 } 3624 } 3625 } 3626 3627 // shift bytes right by immediate ('right' as they appear in memory) 3628 T shiftBytesRightImmediate(size_t bytes, SIMDVer Ver = simdVer, T)(inout T v) 3629 { 3630 static assert(bytes >= 0 && bytes < 16, "Invalid shift amount"); 3631 static if(bytes == 0) // shift by 0 is a no-op 3632 return v; 3633 else 3634 { 3635 version(X86_OR_X64) 3636 { 3637 version(DigitalMars) 3638 { 3639 // little endian reads the bytes into the register in reverse, so we need to flip the operations 3640 return __simd_ib(XMM.PSLLDQ, v, bytes); 3641 } 3642 else version(GNU_OR_LDC) 3643 { 3644 // little endian reads the bytes into the register in reverse, so we need to flip the operations 3645 return cast(T) __builtin_ia32_pslldqi128(cast(ubyte16) v, bytes * 8); // TODO: *8? WAT? 3646 } 3647 } 3648 else version(ARM) 3649 { 3650 static assert(0, "TODO"); 3651 } 3652 else 3653 { 3654 static assert(0, "Unsupported on this architecture"); 3655 } 3656 } 3657 } 3658 3659 // shift bytes left by immediate 3660 T rotateBytesLeftImmediate(size_t bytes, SIMDVer Ver = simdVer, T)(inout T v) 3661 { 3662 enum b = bytes & 15; 3663 3664 static if(b == 0) // shift by 0 is a no-op 3665 return v; 3666 else 3667 { 3668 static assert(b >= 0 && b < 16, "Invalid shift amount"); 3669 3670 version(X86_OR_X64) 3671 { 3672 return or!Ver(shiftBytesLeftImmediate!(b, Ver)(v), shiftBytesRightImmediate!(16 - b, Ver)(v)); 3673 } 3674 else 3675 { 3676 static assert(0, "Unsupported on this architecture"); 3677 } 3678 } 3679 } 3680 3681 // shift bytes right by immediate 3682 T rotateBytesRightImmediate(size_t bytes, SIMDVer Ver = simdVer, T)(inout T v) 3683 { 3684 enum b = bytes & 15; 3685 3686 static if(b == 0) // shift by 0 is a no-op 3687 return v; 3688 else 3689 { 3690 static assert(b >= 0 && b < 16, "Invalid shift amount"); 3691 3692 version(X86_OR_X64) 3693 { 3694 return or!Ver(shiftBytesRightImmediate!(b, Ver)(v), shiftBytesLeftImmediate!(16 - b, Ver)(v)); 3695 } 3696 else 3697 { 3698 static assert(0, "Unsupported on this architecture"); 3699 } 3700 } 3701 } 3702 3703 // shift elements left 3704 T shiftElementsLeft(size_t n, SIMDVer Ver = simdVer, T)(inout T v) 3705 { 3706 return shiftBytesLeftImmediate!(n * BaseType!(T).sizeof, Ver)(v); 3707 } 3708 3709 // shift elements right 3710 T shiftElementsRight(size_t n, SIMDVer Ver = simdVer, T)(inout T v) 3711 { 3712 return shiftBytesRightImmediate!(n * BaseType!(T).sizeof, Ver)(v); 3713 } 3714 3715 // shift elements left, shifting elements from v2 into the exposed elements of v1 3716 T shiftElementsLeftPair(size_t n, SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3717 { 3718 static assert(n >= 0 && n <= NumElements!T, "Invalid shift amount"); 3719 3720 static if(n == 0) // shift by 0 is a no-op 3721 return v1; 3722 else static if(n == NumElements!T) // shift by NumElements!T is a no-op 3723 return v2; 3724 else 3725 { 3726 version(X86_OR_X64) 3727 { 3728 /+ TODO: Finish me! 
3729 static if(Ver >= SIMDVersion.SSSE3) 3730 { 3731 version(DigitalMars) 3732 return __simd(XMM.PALIGNR, v1, v2, n * BaseType!(T).sizeof); 3733 else version(GNU_OR_LDC) 3734 static assert(false, "TODO: what is the intrinsics?!"); 3735 } 3736 else static if(n == NumElements!T/2) 3737 { 3738 // sine we're splitting in the middle, we can use a shuf 3739 static assert(false, "TODO: create the proper shuffle"); 3740 } 3741 else 3742 +/ 3743 { 3744 return or!Ver(shiftElementsLeft!(n, Ver)(v1), shiftElementsRight!(NumElements!T - n, Ver)(v2)); 3745 } 3746 } 3747 else 3748 { 3749 // TODO: detect opportunities to use shuf instead of shifts... 3750 return or!Ver(shiftElementsLeft!(n, Ver)(v1), shiftElementsRight!(NumElements!T - n, Ver)(v2)); 3751 } 3752 } 3753 } 3754 3755 // shift elements right, shifting elements from v2 into the exposed elements of v1 3756 T shiftElementsRightPair(size_t n, SIMDVer Ver = simdVer, T)(inout T v1, inout T v2) 3757 { 3758 return shiftElementsLeftPair!(NumElements!T-n, Ver)(v2, v1); 3759 } 3760 3761 // rotate elements left 3762 T rotateElementsLeft(size_t n, SIMDVer Ver = simdVer, T)(inout T v) 3763 { 3764 enum e = n & (NumElements!T - 1); // large rotations should wrap 3765 3766 static if(e == 0) // shift by 0 is a no-op 3767 return v; 3768 else 3769 { 3770 version(X86_OR_X64) 3771 { 3772 static if(is64bitElement!T) 3773 { 3774 return swizzle!("YX",Ver)(v); 3775 } 3776 else static if(is32bitElement!T) 3777 { 3778 // we can do this with shuffles more efficiently than rotating bytes 3779 static if(e == 1) 3780 return swizzle!("YZWX",Ver)(v); // X, [Y, Z, W, X], Y, Z, W 3781 static if(e == 2) 3782 return swizzle!("ZWXY",Ver)(v); // X, Y, [Z, W, X, Y], Z, W 3783 static if(e == 3) 3784 return swizzle!("WXYZ",Ver)(v); // X, Y, Z, [W, X, Y, Z], W 3785 } 3786 else 3787 { 3788 // perform the operation as bytes 3789 static if(is16bitElement!T) 3790 enum bytes = e * 2; 3791 else 3792 enum bytes = e; 3793 3794 // we can use a shuf for multiples of 4 bytes 3795 static if((bytes & 3) == 0) 3796 return cast(T)rotateElementsLeft!(bytes >> 2, Ver)(cast(uint4)v); 3797 else 3798 return rotateBytesLeftImmediate!(bytes, Ver)(v); 3799 } 3800 } 3801 else 3802 { 3803 static assert(0, "Unsupported on this architecture"); 3804 } 3805 } 3806 } 3807 3808 // rotate elements right 3809 T rotateElementsRight(size_t n, SIMDVer Ver = simdVer, T)(inout T v) 3810 { 3811 enum e = n & (NumElements!T - 1); // large rotations should wrap 3812 3813 static if(e == 0) // shift by 0 is a no-op 3814 return v; 3815 else 3816 { 3817 // just invert the rotation 3818 return rotateElementsLeft!(NumElements!T - e, Ver)(v); 3819 } 3820 } 3821 3822 3823 /////////////////////////////////////////////////////////////////////////////// 3824 // Comparisons 3825 3826 // true if all elements: r = A[n] == B[n] && A[n+1] == B[n+1] && ... 3827 bool allEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3828 { 3829 return null; 3830 } 3831 3832 // true if all elements: r = A[n] != B[n] && A[n+1] != B[n+1] && ... 3833 bool allNotEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3834 { 3835 return null; 3836 } 3837 3838 // true if all elements: r = A[n] > B[n] && A[n+1] > B[n+1] && ... 3839 bool allGreater(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3840 { 3841 return null; 3842 } 3843 3844 // true if all elements: r = A[n] >= B[n] && A[n+1] >= B[n+1] && ... 
3845 bool allGreaterEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3846 { 3847 return null; 3848 } 3849 3850 // true if all elements: r = A[n] < B[n] && A[n+1] < B[n+1] && ... 3851 bool allLess(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3852 { 3853 return null; 3854 } 3855 3856 // true if all elements: r = A[n] <= B[n] && A[n+1] <= B[n+1] && ... 3857 bool allLessEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3858 { 3859 return null; 3860 } 3861 3862 // true if any elements: r = A[n] == B[n] || A[n+1] == B[n+1] || ... 3863 bool anyEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3864 { 3865 return null; 3866 } 3867 3868 // true if any elements: r = A[n] != B[n] || A[n+1] != B[n+1] || ... 3869 bool anyNotEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3870 { 3871 return null; 3872 } 3873 3874 // true if any elements: r = A[n] > B[n] || A[n+1] > B[n+1] || ... 3875 bool anyGreater(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3876 { 3877 return null; 3878 } 3879 3880 // true if any elements: r = A[n] >= B[n] || A[n+1] >= B[n+1] || ... 3881 bool anyGreaterEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3882 { 3883 return null; 3884 } 3885 3886 // true if any elements: r = A[n] < B[n] || A[n+1] < B[n+1] || ... 3887 bool anyLess(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3888 { 3889 return null; 3890 } 3891 3892 // true if any elements: r = A[n] <= B[n] || A[n+1] <= B[n+1] || ... 3893 bool anyLessEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3894 { 3895 return null; 3896 } 3897 3898 3899 /////////////////////////////////////////////////////////////////////////////// 3900 // Generate bit masks 3901 3902 // generate a bitmask of for elements: Rn = An == Bn ? -1 : 0 3903 byte16 maskEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3904 { 3905 version(X86_OR_X64) 3906 { 3907 version(DigitalMars) 3908 { 3909 static if(isOfType!(T, double2)) 3910 return __simd(XMM.CMPPD, a, b, 0); 3911 else static if(isOfType!(T, float4)) 3912 return __simd(XMM.CMPPS, a, b, 0); 3913 else static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3914 { 3915 static if(Ver >= SIMDVer.SSE41) 3916 return __simd(XMM.PCMPEQQ, a, b); 3917 else 3918 static assert(0, "Only supported in SSE4.1 and above"); 3919 } 3920 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3921 return __simd(XMM.PCMPEQD, a, b); 3922 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3923 return __simd(XMM.PCMPEQW, a, b); 3924 else static if(isOfType!(T, byte16) || isOfType!(T, ubyte16)) 3925 return __simd(XMM.PCMPEQB, a, b); 3926 else 3927 static assert(0, "Unsupported vector type: " ~ T.stringof); 3928 } 3929 else version(GNU) 3930 { 3931 static if(isOfType!(T, double2)) 3932 return __builtin_ia32_cmpeqpd(a, b); 3933 else static if(isOfType!(T, float4)) 3934 return __builtin_ia32_cmpeqps(a, b); 3935 else static if(isOfType!(T, long2) || isOfType!(T, ulong2)) 3936 { 3937 static if(Ver >= SIMDVer.SSE41) 3938 return __builtin_ia32_pcmpeqq(a, b); 3939 else 3940 static assert(0, "Only supported in SSE4.1 and above"); 3941 } 3942 else static if(isOfType!(T, int4) || isOfType!(T, uint4)) 3943 return __builtin_ia32_pcmpeqd128(a, b); 3944 else static if(isOfType!(T, short8) || isOfType!(T, ushort8)) 3945 return __builtin_ia32_pcmpeqw128(a, b); 3946 else static if(isOfType!(T, byte16) || isOfType!(T, ubyte16)) 3947 return __builtin_ia32_pcmpeqb128(a, b); 3948 else 3949 static assert(0, "Unsupported vector type: " ~ T.stringof); 3950 } 3951 else version(LDC) 3952 return ldcsimd.equalMask!T(a, 
b); 3953 } 3954 else version(ARM) 3955 { 3956 static assert(0, "TODO"); 3957 } 3958 else 3959 { 3960 static assert(0, "Unsupported on this architecture"); 3961 } 3962 } 3963 3964 // generate a bitmask of for elements: Rn = An != Bn ? -1 : 0 (SLOW) 3965 void16 maskNotEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 3966 { 3967 version(X86_OR_X64) 3968 { 3969 version(DigitalMars) 3970 { 3971 static if(isOfType!(T, double2)) 3972 return __simd(XMM.CMPPD, a, b, 4); 3973 else static if(isOfType!(T, float4)) 3974 return __simd(XMM.CMPPS, a, b, 4); 3975 else 3976 return comp!Ver(cast(void16)maskEqual!Ver(a, b)); 3977 } 3978 else version(GNU) 3979 { 3980 static if(isOfType!(T, double2)) 3981 return __builtin_ia32_cmpneqpd(a, b); 3982 else static if(isOfType!(T, float4)) 3983 return __builtin_ia32_cmpneqps(a, b); 3984 else 3985 return comp!Ver(cast(void16)maskEqual!Ver(a, b)); 3986 } 3987 else version(LDC) 3988 return ldcsimd.notEqualMask!T(a, b); 3989 } 3990 else version(ARM) 3991 { 3992 static assert(0, "TODO"); 3993 } 3994 else 3995 { 3996 static assert(0, "Unsupported on this architecture"); 3997 } 3998 } 3999 4000 // generate a bitmask of for elements: Rn = An > Bn ? -1 : 0 4001 void16 maskGreater(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 4002 { 4003 version(X86_OR_X64) 4004 { 4005 version(DigitalMars) 4006 { 4007 static if(isOfType!(T, double2)) 4008 return __simd(XMM.CMPPD, b, a, 1); 4009 else static if(isOfType!(T, float4)) 4010 return __simd(XMM.CMPPS, b, a, 1); 4011 else static if(isOfType!(T, long2)) 4012 return __simd(XMM.PCMPGTQ, a, b); 4013 else static if(isOfType!(T, ulong2)) 4014 return __simd(XMM.PCMPGTQ, a + signMask2, b + signMask2); 4015 else static if(isOfType!(T, int4)) 4016 return __simd(XMM.PCMPGTD, a, b); 4017 else static if(isOfType!(T, uint4)) 4018 return __simd(XMM.PCMPGTD, a + signMask4, b + signMask4); 4019 else static if(isOfType!(T, short8)) 4020 return __simd(XMM.PCMPGTW, a, b); 4021 else static if(isOfType!(T, ushort8)) 4022 return __simd(XMM.PCMPGTW, a + signMask8, b + signMask8); 4023 else static if(isOfType!(T, byte16)) 4024 return __simd(XMM.PCMPGTB, a, b); 4025 else static if(isOfType!(T, ubyte16)) 4026 return __simd(XMM.PCMPGTB, a + signMask16, b + signMask16); 4027 else 4028 static assert(0, "Unsupported vector type: " ~ T.stringof); 4029 } 4030 else version(GNU) 4031 { 4032 static if(isOfType!(T, double2)) 4033 return __builtin_ia32_cmpgtpd(a, b); 4034 else static if(isOfType!(T, float4)) 4035 return __builtin_ia32_cmpgtps(a, b); 4036 else static if(isOfType!(T, long2)) 4037 return __builtin_ia32_pcmpgtq(a, b); 4038 else static if(isOfType!(T, ulong2)) 4039 return __builtin_ia32_pcmpgtq(a + signMask2, b + signMask2); 4040 else static if(isOfType!(T, int4)) 4041 return __builtin_ia32_pcmpgtd128(a, b); 4042 else static if(isOfType!(T, uint4)) 4043 return __builtin_ia32_pcmpgtd128(a + signMask4, b + signMask4); 4044 else static if(isOfType!(T, short8)) 4045 return __builtin_ia32_pcmpgtw128(a, b); 4046 else static if(isOfType!(T, ushort8)) 4047 return __builtin_ia32_pcmpgtw128(a + signMask8, b + signMask8); 4048 else static if(isOfType!(T, byte16)) 4049 return __builtin_ia32_pcmpgtb128(a, b); 4050 else static if(isOfType!(T, ubyte16)) 4051 return __builtin_ia32_pcmpgtb128(a + signMask16, b + signMask16); 4052 else 4053 static assert(0, "Unsupported vector type: " ~ T.stringof); 4054 } 4055 else version(LDC) 4056 return ldcsimd.greaterMask!T(a, b); 4057 } 4058 else version(ARM) 4059 { 4060 static assert(0, "TODO"); 4061 } 4062 else 4063 { 4064 static 
assert(0, "Unsupported on this architecture"); 4065 } 4066 } 4067 4068 // generate a bitmask of for elements: Rn = An >= Bn ? -1 : 0 (SLOW) 4069 void16 maskGreaterEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 4070 { 4071 version(X86_OR_X64) 4072 { 4073 version(DigitalMars) 4074 { 4075 static if(isOfType!(T, double2)) 4076 return __simd(XMM.CMPPD, b, a, 2); 4077 else static if(isOfType!(T, float4)) 4078 return __simd(XMM.CMPPS, b, a, 2); 4079 else 4080 return or!Ver(cast(void16)maskGreater!Ver(a, b), cast(void16)maskEqual!Ver(a, b)); // compound greater OR equal 4081 } 4082 else version(GNU) 4083 { 4084 static if(isOfType!(T, double2)) 4085 return __builtin_ia32_cmpgepd(a, b); 4086 else static if(isOfType!(T, float4)) 4087 return __builtin_ia32_cmpgeps(a, b); 4088 else 4089 return or!Ver(cast(void16)maskGreater!Ver(a, b), cast(void16)maskEqual!Ver(a, b)); // compound greater OR equal 4090 } 4091 else version(LDC) 4092 return ldcsimd.greaterOrEqualMask!T(a, b); 4093 } 4094 else version(ARM) 4095 { 4096 static assert(0, "TODO"); 4097 } 4098 else 4099 { 4100 static assert(0, "Unsupported on this architecture"); 4101 } 4102 } 4103 4104 // generate a bitmask of for elements: Rn = An < Bn ? -1 : 0 (SLOW) 4105 void16 maskLess(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 4106 { 4107 version(X86_OR_X64) 4108 { 4109 version(DigitalMars) 4110 { 4111 static if(isOfType!(T, double2)) 4112 return __simd(XMM.CMPPD, a, b, 1); 4113 else static if(isOfType!(T, float4)) 4114 return __simd(XMM.CMPPS, a, b, 1); 4115 else 4116 return maskGreater!Ver(b, a); // reverse the args 4117 } 4118 else version(GNU) 4119 { 4120 static if(isOfType!(T, double2)) 4121 return __builtin_ia32_cmpltpd(a, b); 4122 else static if(isOfType!(T, float4)) 4123 return __builtin_ia32_cmpltps(a, b); 4124 else 4125 return maskGreater!Ver(b, a); // reverse the args 4126 } 4127 else version(LDC) 4128 return ldcsimd.greaterMask!T(b, a); 4129 } 4130 else version(ARM) 4131 { 4132 static assert(0, "TODO"); 4133 } 4134 else 4135 { 4136 static assert(0, "Unsupported on this architecture"); 4137 } 4138 } 4139 4140 // generate a bitmask of for elements: Rn = An <= Bn ? -1 : 0 4141 void16 maskLessEqual(SIMDVer Ver = simdVer, T)(inout T a, inout T b) 4142 { 4143 version(X86_OR_X64) 4144 { 4145 version(DigitalMars) 4146 { 4147 static if(isOfType!(T, double2)) 4148 return __simd(XMM.CMPPD, a, b, 2); 4149 else static if(isOfType!(T, float4)) 4150 return __simd(XMM.CMPPS, a, b, 2); 4151 else 4152 return maskGreaterEqual!Ver(b, a); // reverse the args 4153 } 4154 else version(GNU) 4155 { 4156 static if(isOfType!(T, double2)) 4157 return __builtin_ia32_cmplepd(a, b); 4158 else static if(isOfType!(T, float4)) 4159 return __builtin_ia32_cmpleps(a, b); 4160 else 4161 return maskGreaterEqual!Ver(b, a); // reverse the args 4162 } 4163 else version(LDC) 4164 return ldcsimd.greaterOrEqualMask!T(b, a); 4165 } 4166 else version(ARM) 4167 { 4168 static assert(0, "TODO"); 4169 } 4170 else 4171 { 4172 static assert(0, "Unsupported on this architecture"); 4173 } 4174 } 4175 4176 4177 /////////////////////////////////////////////////////////////////////////////// 4178 // Branchless selection 4179 4180 // select elements according to: mask == true ? 
///////////////////////////////////////////////////////////////////////////////
// Branchless selection

// select elements according to: mask == true ? x : y
T select(SIMDVer Ver = simdVer, T)(void16 mask, inout T x, inout T y)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static if(Ver >= SIMDVer.SSE41)
            {
                static if(isOfType!(T, double2))
                    return __simd(XMM.BLENDVPD, y, x, mask);
                else static if(isOfType!(T, float4))
                    return __simd(XMM.BLENDVPS, y, x, mask);
                else
                    return __simd(XMM.PBLENDVB, y, x, mask);
            }
            else
                return xor!Ver(cast(void16)y, and!Ver(mask, xor!Ver(cast(void16)x, cast(void16)y)));
        }
        else version(GNU_OR_LDC)
        {
            static if(Ver >= SIMDVer.SSE41)
            {
                static if(isOfType!(T, double2))
                    return __builtin_ia32_blendvpd(y, x, cast(double2)mask);
                else static if(isOfType!(T, float4))
                    return __builtin_ia32_blendvps(y, x, cast(float4)mask);
                else
                {
                    alias P = PblendvbParam;
                    return cast(T)__builtin_ia32_pblendvb128(cast(P)y, cast(P)x, cast(P)mask);
                }
            }
            else
                return xor!Ver(cast(void16)y, and!Ver(mask, xor!Ver(cast(void16)x, cast(void16)y)));
        }
    }
    else version(ARM)
    {
        static assert(0, "TODO");
    }
    else
    {
        // simulate on any architecture without an opcode: ((x ^ y) & mask) ^ y
        return xor!Ver(cast(void16)y, and!Ver(mask, xor!Ver(cast(void16)x, cast(void16)y)));
    }
}

// select elements: Rn = An == Bn ? Xn : Yn
U selectEqual(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskEqual!Ver(a, b), x, y);
}

// select elements: Rn = An != Bn ? Xn : Yn
U selectNotEqual(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskNotEqual!Ver(a, b), x, y);
}

// select elements: Rn = An > Bn ? Xn : Yn
U selectGreater(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskGreater!Ver(a, b), x, y);
}

// select elements: Rn = An >= Bn ? Xn : Yn
U selectGreaterEqual(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskGreaterEqual!Ver(a, b), x, y);
}

// select elements: Rn = An < Bn ? Xn : Yn
U selectLess(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskLess!Ver(a, b), x, y);
}

// select elements: Rn = An <= Bn ? Xn : Yn
U selectLessEqual(SIMDVer Ver = simdVer, T, U)(inout T a, inout T b, inout U x, inout U y)
{
    return select!Ver(maskLessEqual!Ver(a, b), x, y);
}
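
// A minimal sketch of branchless selection, assuming an x86/x86_64 build with
// float4 support: pairing a comparison mask with select() gives a per-element
// conditional, e.g. selectGreater(a, b, a, b) computes a per-element max.
unittest
{
    version(X86_OR_X64)
    {
        float4 a = [1.0f, 5.0f, 3.0f, 7.0f];
        float4 b = [2.0f, 4.0f, 3.0f, 9.0f];

        // Rn = An > Bn ? An : Bn
        float4 m = selectGreater(a, b, a, b);

        foreach(i; 0 .. 4)
            assert(m.array[i] == (a.array[i] > b.array[i] ? a.array[i] : b.array[i]));
    }
}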
///////////////////////////////////////////////////////////////////////////////
// Matrix API

// define a/some matrix type(s)
//...

struct float4x4
{
    float4 xRow;
    float4 yRow;
    float4 zRow;
    float4 wRow;
}

struct double2x2
{
    double2 xRow;
    double2 yRow;
}

///////////////////////////////////////////////////////////////////////////////
// Matrix functions

T transpose(SIMDVer Ver = simdVer, T)(inout T m)
{
    version(X86_OR_X64)
    {
        version(DigitalMars)
        {
            static assert(0, "TODO");
        }
        else version(GNU)
        {
            static if(isOfType!(T, float4x4))
            {
                float4 b0 = __builtin_ia32_shufps(m.xRow, m.yRow, shufMask!(0,1,0,1));
                float4 b1 = __builtin_ia32_shufps(m.zRow, m.wRow, shufMask!(0,1,0,1));
                float4 b2 = __builtin_ia32_shufps(m.xRow, m.yRow, shufMask!(2,3,2,3));
                float4 b3 = __builtin_ia32_shufps(m.zRow, m.wRow, shufMask!(2,3,2,3));
                float4 a0 = __builtin_ia32_shufps(b0, b1, shufMask!(0,2,0,2));
                float4 a1 = __builtin_ia32_shufps(b2, b3, shufMask!(0,2,0,2));
                float4 a2 = __builtin_ia32_shufps(b0, b1, shufMask!(1,3,1,3));
                float4 a3 = __builtin_ia32_shufps(b2, b3, shufMask!(1,3,1,3));

                return float4x4(a0, a2, a1, a3);
            }
            else static if(isOfType!(T, double2x2))
            {
                static if(Ver >= SIMDVer.SSE2)
                {
                    return double2x2(
                        __builtin_ia32_unpcklpd(m.xRow, m.yRow),
                        __builtin_ia32_unpckhpd(m.xRow, m.yRow));
                }
                else
                    static assert(0, "TODO");
            }
            else
                static assert(0, "Unsupported matrix type: " ~ T.stringof);
        }
    }
    else
    {
        static assert(0, "Unsupported on this architecture");
    }
}


// determinant, etc...
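
// A minimal sketch of transpose() on float4x4, guarded to GDC on x86/x86_64
// since that is the only path implemented above: element (r, c) of the result
// must equal element (c, r) of the input.
version(GNU)
{
    version(X86_OR_X64) unittest
    {
        float4 r0 = [ 1.0f,  2.0f,  3.0f,  4.0f];
        float4 r1 = [ 5.0f,  6.0f,  7.0f,  8.0f];
        float4 r2 = [ 9.0f, 10.0f, 11.0f, 12.0f];
        float4 r3 = [13.0f, 14.0f, 15.0f, 16.0f];
        float4x4 m = float4x4(r0, r1, r2, r3);

        float4x4 t = transpose(m);

        assert(t.xRow.array[1] == m.yRow.array[0]); // result (0,1) == input (1,0)
        assert(t.wRow.array[0] == m.xRow.array[3]); // result (3,0) == input (0,3)
    }
}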
///////////////////////////////////////////////////////////////////////////////
// Unit test the lot!
/*
unittest
{
    import std.traits;
    import std.typetuple;
    import std.conv;

    template staticIota(int start, int end, int stride = 1)
    {
        static if(start >= end)
            alias staticIota = TypeTuple!();
        else
            alias staticIota = TypeTuple!(start, staticIota!(start + stride, end, stride));
    }

    template staticRepeat(int n, a...) if(a.length == 1)
    {
        static if(n <= 0)
            alias staticRepeat = TypeTuple!();
        else
            alias staticRepeat = TypeTuple!(a, staticRepeat!(n - 1, a));
    }

    void testver(SIMDVer Ver)()
    {
        import std.math;

        T Clamp(T, U)(T a, U x, T b)
        {
            return cast(T)(a > x ? a : (x > b ? b : x));
        }

        alias SignedInts = TypeTuple!(long, int, short, byte);
        alias UnsignedInts = TypeTuple!(ulong, uint, ushort, ubyte);
        alias Ints = TypeTuple!(SignedInts, UnsignedInts);
        alias Floats = TypeTuple!(float, double);
        alias Signed = TypeTuple!(Floats, SignedInts);
        alias All = TypeTuple!(Floats, Ints);

        testTypes!(Ver, false, byElement, std.simd.abs, (a) => a < 0 ? -a : a, All)();
        testTypes!(Ver, false, byElement, neg, (a) => -a, All)();
        testTypes!(Ver, false, byElement, add, (a, b) => a + b, All)();
        testTypes!(Ver, false, byElement, addSaturate, (a, b) => Clamp(typeof(a).min, a + b, typeof(a).max), Ints)();
        testTypes!(Ver, false, byElement, sub, (a, b) => a - b, All)();
        testTypes!(Ver, false, byElement, subSaturate, (a, b) => Clamp(typeof(a).min, a - b, typeof(a).max), Ints)();
        testTypes!(Ver, false, byElement, mul, (a, b) => a * b, All)();
        testTypes!(Ver, false, byElement, madd, (a, b, c) => a*b + c, All)();
        testTypes!(Ver, false, byElement, msub, (a, b, c) => a*b - c, All)();
        testTypes!(Ver, false, byElement, nmadd, (a, b, c) => -a*b + c, All)();
        testTypes!(Ver, false, byElement, nmsub, (a, b, c) => -a*b - c, All)();
        testTypes!(Ver, false, byElement, min, (a, b) => a < b ? a : b, All)();
        testTypes!(Ver, false, byElement, max, (a, b) => a > b ? a : b, All)();
        testTypes!(Ver, false, byElement, clamp, (a, v, b) => Clamp(a, v, b), All)();
        testTypes!(Ver, false, byElement, lerp, (a, b, t) => (b-a)*t + a, All)();
        testTypes!(Ver, false, byElement, comp, (a) => ~a, Ints)();
        testTypes!(Ver, false, byElement, or, (a, b) => a | b, Ints)();
        testTypes!(Ver, false, byElement, nor, (a, b) => ~(a | b), Ints)();
        testTypes!(Ver, false, byElement, and, (a, b) => a & b, Ints)();
        testTypes!(Ver, false, byElement, nand, (a, b) => ~(a & b), Ints)();
        testTypes!(Ver, false, byElement, andNot, (a, b) => a & ~b, Ints)();
        testTypes!(Ver, false, byElement, xor, (a, b) => a ^ b, Ints)();

        testTypes!(Ver, false, byElement, div, (a, b) => a / b, Floats)();
        testTypes!(Ver, false, byElement, rcp, (a) => 1.0/a, Floats)();
        testTypes!(Ver, false, byElement, std.simd.sqrt, (a) => std.math.sqrt(a), Floats)();
        testTypes!(Ver, false, byElement, rsqrt, (a) => 1.0/std.math.sqrt(a), Floats)();
        testTypes!(Ver, true, byElement, divEst, (a, b) => a / b, Floats)();
        testTypes!(Ver, true, byElement, rcpEst, (a) => 1.0/a, Floats)();
        testTypes!(Ver, true, byElement, sqrtEst, (a) => std.math.sqrt(a), Floats)();
        testTypes!(Ver, true, byElement, rsqrtEst, (a) => 1.0/std.math.sqrt(a), Floats)();

        testTypes!(Ver, false, byElement, std.simd.floor, (a) => std.math.floor(a), Floats)();
        testTypes!(Ver, false, byElement, std.simd.ceil, (a) => std.math.ceil(a), Floats)();
        testTypes!(Ver, false, byElement, std.simd.round, (a) => std.math.round(a), Floats)();
        testTypes!(Ver, false, byElement, std.simd.trunc, (a) => std.math.trunc(a), Floats)();

        testTypes!(Ver, false, byVector, dot2, (a, b) => a[0]*b[0] + a[1]*b[1], Floats)();
        testTypes!(Ver, false, byVector, dot3, (a, b) => a[0]*b[0] + a[1]*b[1] + a[2]*b[2], float)();
        testTypes!(Ver, false, byVector, dot4, (a, b) => a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3], float)();
        testTypes!(Ver, false, byVector, dotH, (a, b) => a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + b[3], float)();
        testTypes!(Ver, false, byVector, cross3, (a, b) => [ a[1]*b[2] - a[2]*b[1], a[2]*b[0] - a[0]*b[2], a[0]*b[1] - a[1]*b[0], 0], float)();
        testTypes!(Ver, false, byVector, magnitude3, (a) => std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2]), float)();
        testTypes!(Ver, false, byVector, magnitude4, (a) => std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2] + a[3]*a[3]), float)();
        testTypes!(Ver, false, byVector, normalise3, (a) { float l = 1/std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2]); return [ a[0]*l, a[1]*l, a[2]*l, a[3]*l ]; }, float)();
        testTypes!(Ver, false, byVector, normalise4, (a) { float l = 1/std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2] + a[3]*a[3]); return [ a[0]*l, a[1]*l, a[2]*l, a[3]*l ]; }, float)();
        testTypes!(Ver, false, byVector, magSq3, (a) => a[0]*a[0] + a[1]*a[1] + a[2]*a[2], float)();
        testTypes!(Ver, false, byVector, magSq4, (a) => a[0]*a[0] + a[1]*a[1] + a[2]*a[2] + a[3]*a[3], float)();
        testTypes!(Ver, true, byVector, magEst3, (a) => std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2]), float)();
        testTypes!(Ver, true, byVector, magEst4, (a) => std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2] + a[3]*a[3]), float)();
        testTypes!(Ver, true, byVector, normEst3, (a) { float l = 1/std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2]); return [ a[0]*l, a[1]*l, a[2]*l, a[3]*l ]; }, float)();
        testTypes!(Ver, true, byVector, normEst4, (a) { float l = 1/std.math.sqrt(a[0]*a[0] + a[1]*a[1] + a[2]*a[2] + a[3]*a[3]); return [ a[0]*l, a[1]*l, a[2]*l, a[3]*l ]; }, float)();

        //shiftLeft
        //shiftLeftImmediate
        //shiftRight
        //shiftRightImmediate
        //shiftBytesLeftImmediate
        //shiftBytesRightImmediate
        //rotateBytesLeftImmediate
        //rotateBytesRightImmediate
        //shiftElementsLeft
        //shiftElementsRight
        //shiftElementsLeftPair
        //shiftElementsRightPair
        //rotateElementsLeft
        //rotateElementsRight

        //loadScalar
        //loadUnaligned
        //getScalar
        //storeScalar
        //storeUnaligned
        //getX
        //getY
        //getZ
        //getW
        //setX
        //setY
        //setZ
        //setW
        //swizzle
        //permute
        //interleaveLow
        //interleaveHigh

        //unpackLow
        //unpackHigh
        //pack
        //packSaturate

        //toInt
        //toFloat
        //toDouble
    }

    // check for CPU support before calling each function...
    testver!(SIMDVer.SSE);
    testver!(SIMDVer.SSE2);
    testver!(SIMDVer.SSE3);
    testver!(SIMDVer.SSSE3);
    testver!(SIMDVer.SSE41);
    testver!(SIMDVer.SSE42);
//    testver!(SIMDVer.SSE4a);
//    testver!(SIMDVer.SSE5);
//    testver!(SIMDVer.AVX);
//    testver!(SIMDVer.AVX2);
//    testver!(SIMDVer.AVX512);
}


version (unittest)
{
    import std.random;
    import std.math;

    void testTypes(SIMDVer Ver, bool approx, alias testFunc, alias f, alias l, Types...)()
    {
        // for each type
        foreach(T; Types)
        {
            // work out which vector widths are relevant
            version(X86_OR_X64)
            {
                static if(Ver >= SIMDVer.AVX512)
                    alias Widths = TypeTuple!(128, 256, 512);
                else static if(Ver >= SIMDVer.AVX)
                    alias Widths = TypeTuple!(128, 256);
                else
                    alias Widths = TypeTuple!(128);
            }
            else
                alias Widths = TypeTuple!(128);

            // for each vector width
            foreach(w; Widths)
            {
                auto rng = Xorshift128(w);

                // work out __vector type
                enum numElements = w/(T.sizeof*8);
                alias V = __vector(T[numElements]);

                // pick the right number of args based on whether the function compiles
                V t;
                static if(__traits(compiles, f!Ver(t)))
                {
                    foreach(i; 0..16)
                        testFunc!(Ver, approx, f, l)(randomVector!V(i, rng));
                }
                else static if(__traits(compiles, f!Ver(t, t)))
                {
                    foreach(i; 0..16)
                        testFunc!(Ver, approx, f, l)(randomVector!V(i, rng), randomVector!V(i, rng));
                }
                else static if(__traits(compiles, f!Ver(t, t, t)))
                {
                    foreach(i; 0..16)
                        testFunc!(Ver, approx, f, l)(randomVector!V(i, rng), randomVector!V(i, rng), randomVector!V(i, rng));
                }
                else static if(__traits(compiles, f!Ver(t, t, t, t)))
                {
                    foreach(i; 0..16)
                        testFunc!(Ver, approx, f, l)(randomVector!V(i, rng), randomVector!V(i, rng), randomVector!V(i, rng), randomVector!V(i, rng));
                }
                else
                    pragma(msg, "Unsupported: " ~ f.stringof ~ " with: " ~ V.stringof ~ " " ~ Ver.stringof);
            }
        }
    }

    T randomVector(T, Rng)(int seed, ref Rng rng)
    {
        alias ET = ElementType!T;

        T r = void;
        foreach(ref e; r.array)
        {
            static if(isFloatingPoint!ET)
                e = uniform(cast(ET)-3.0, cast(ET)3.0, rng)^^5.0;
            else
                e = uniform(ET.min, ET.max, rng);
        }
        return r;
    }

    void byElement(SIMDVer Ver, bool approx, alias f, alias l, T...)(T v)
    {
        alias BT = ElementType!(T[0]);

        auto r = f!Ver(v);

        typeof(v[0].array) r2 = void;
        foreach(i; staticIota!(0, r2.length))
        {
            // TODO: can't make a template work in this case >_<
            static if(v.length == 1)
                r2[i] = cast(BT)l(v[0].array[i]);
            else static if(v.length == 2)
                r2[i] = cast(BT)l(v[0].array[i], v[1].array[i]);
            else static if(v.length == 3)
                r2[i] = cast(BT)l(v[0].array[i], v[1].array[i], v[2].array[i]);
        }

        assert(eq!(approx)(r.array, r2), "Incorrect result in function: " ~ f.stringof ~ " for type: " ~ T[0].stringof ~ " with SIMD Ver: " ~ Ver.stringof);
    }

    void byVector(SIMDVer Ver, bool approx, alias f, alias l, T...)(T v)
    {
        auto r = f!Ver(v);

        typeof(v[0].array) r2 = void;

        // TODO: can't make a template work in this case >_<
        static if(v.length == 1)
            r2 = l(v[0].array);
        else static if(v.length == 2)
            r2 = l(v[0].array, v[1].array);
        else static if(v.length == 3)
            r2 = l(v[0].array, v[1].array, v[2].array);

        assert(eq!(approx)(r.array, r2));
    }

    bool eq(bool approx = false, T)(T a, T b)
    {
        static if(isIntegral!T || is(T == bool))
        {
            return a == b;
        }
        else static if(isFloatingPoint!T)
        {
            if(a.isNaN && b.isNaN)
                return true;
            return feqrel(a, b) + 3 >= (approx ? T.mant_dig / 2 : T.mant_dig);
        }
        else static if(isStaticArray!T)
        {
            foreach(i; staticIota!(0, T.length))
                if(!eq!approx(a[i], b[i]))
                    return false;
            return true;
        }
    }
}
*/
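
// A minimal sketch of the tolerance the commented-out eq() helper above asks
// for: floats are compared by counting matching mantissa bits via
// std.math.feqrel, with only about half of them required when testing the
// *Est estimate functions. The values below are arbitrary examples.
unittest
{
    import std.math : feqrel;

    float exact = 1.0f;
    float approx = 1.0f + 1e-4f; // agrees in roughly 13 of float's 24 mantissa bits

    assert(feqrel(exact, exact) == float.mant_dig);          // identical values match every bit
    assert(feqrel(exact, approx) + 3 >= float.mant_dig / 2); // close enough for the estimate path
    assert(feqrel(exact, approx) + 3 < float.mant_dig);      // but not for the exact path
}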