1 /**
 * The core.internal.atomic module contains the low-level atomic features available in hardware.
3 * This module may be a routing layer for compiler intrinsics.
4 *
5 * Copyright: Copyright Manu Evans 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Sean Kelly, Alex Rønne Petersen, Manu Evans
8 * Source:    $(DRUNTIMESRC core/internal/_atomic.d)
9 */
10 
11 module core.internal.atomic;
12 
13 import core.atomic : MemoryOrder, has128BitCAS;
14 
15 version (LDC)
16 {
17     import ldc.intrinsics;
18 
19     pragma(inline, true):
20 
21     inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
22     {
23         alias A = _AtomicType!T;
24         A result = llvm_atomic_load!A(cast(shared A*) src, _ordering!(order));
25         return *cast(inout(T)*) &result;
26     }
27 
28     void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
29     {
30         alias A = _AtomicType!T;
31         llvm_atomic_store!A(*cast(A*) &value, cast(shared A*) dest, _ordering!(order));
32     }
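
    // A minimal check of the load/store contract: a seq-cst store is observed
    // by a subsequent load of the same location.
    unittest
    {
        int x;
        atomicStore(&x, 42);
        assert(atomicLoad(&x) == 42);
        assert(atomicLoad!(MemoryOrder.raw)(&x) == 42);
    }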
33 
34     T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
35     {
36         alias A = _AtomicType!T;
37         return llvm_atomic_rmw_add!A(cast(shared A*) dest, value, _ordering!(order));
38     }
39 
40     T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
41     {
42         alias A = _AtomicType!T;
43         return llvm_atomic_rmw_sub!A(cast(shared A*) dest, value, _ordering!(order));
44     }
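
    // The fetch-add/fetch-sub primitives return the value held before the update.
    unittest
    {
        uint x = 5;
        assert(atomicFetchAdd(&x, 3u) == 5);
        assert(atomicFetchSub(&x, 2u) == 8);
        assert(x == 6);
    }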
45 
46     T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
47     {
48         alias A = _AtomicType!T;
        A res = llvm_atomic_rmw_xchg!A(cast(shared A*) dest, *cast(A*) &value, _ordering!(order));
        return *cast(T*) &res;
51     }
52 
53     bool atomicCompareExchange(bool weak = false, MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
54     {
55         alias A = _AtomicType!T;
56         auto result = llvm_atomic_cmp_xchg!A(cast(shared A*) dest, *cast(A*) compare, *cast(A*) &value,
57             _ordering!(succ), _ordering!(fail), weak);
58         *compare = *cast(T*) &result.previousValue;
59         return result.exchanged;
60     }
61     bool atomicCompareExchangeWeak(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
62     {
63         return atomicCompareExchange!(true, succ, fail, T)(dest, compare, value);
64     }
65     bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
66     {
67         return atomicCompareExchange!(false, succ, fail, T)(dest, compare, value);
68     }
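
    // Sketch of the CAS contract: on failure `*compare` receives the value
    // actually observed in `*dest`.
    unittest
    {
        int x = 1;
        int expected = 1;
        assert(atomicCompareExchangeStrong(&x, &expected, 2));
        assert(x == 2);

        expected = 1; // stale expectation
        assert(!atomicCompareExchangeStrong(&x, &expected, 3));
        assert(x == 2 && expected == 2);
    }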
69 
70     bool atomicCompareExchangeNoResult(bool weak = false, MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
71     {
72         alias A = _AtomicType!T;
73         auto result = llvm_atomic_cmp_xchg!A(cast(shared A*) dest, *cast(A*) &compare, *cast(A*) &value,
74             _ordering!(succ), _ordering!(fail), weak);
75         return result.exchanged;
76     }
77     bool atomicCompareExchangeWeakNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
78     {
79         return atomicCompareExchangeNoResult!(true, succ, fail, T)(dest, compare, value);
80     }
81     bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
82     {
83         return atomicCompareExchangeNoResult!(false, succ, fail, T)(dest, compare, value);
84     }
85 
86     void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
87     {
88         llvm_memory_fence(_ordering!(order));
89     }
90 
91     void pause() pure nothrow @nogc @trusted
92     {
93         version (X86)
94             enum inst = "pause";
95         else version (X86_64)
96             enum inst = "pause";
97         else version (ARM)
98         {
99             // requires v6k+ (e.g., -mtriple=armv6k-linux-gnueabihf)
100             static if (__traits(targetHasFeature, "v6k"))
101                 enum inst = "yield";
102             else
103                 enum inst = null;
104         }
105         else version (AArch64)
106             enum inst = "yield";
107         else version (MIPS32)
108         {
109             // requires ISA r2+ (e.g., -mcpu=mips32r2)
110             static if (__traits(targetHasFeature, "mips32r2"))
111                 enum inst = "pause";
112             else
113                 enum inst = null;
114         }
115         else version (MIPS64)
116         {
117             // requires ISA r2+ (e.g., -mcpu=mips64r2)
118             static if (__traits(targetHasFeature, "mips64r2"))
119                 enum inst = "pause";
120             else
121                 enum inst = null;
122         }
123         else
124             enum inst = null; // TODO?
125 
126         static if (inst !is null)
127             asm pure nothrow @nogc @trusted { (inst); }
128     }
129 
130     template _ordering(MemoryOrder ms)
131     {
132         static if (ms == MemoryOrder.acq)
133             enum _ordering = AtomicOrdering.Acquire;
134         else static if (ms == MemoryOrder.rel)
135             enum _ordering = AtomicOrdering.Release;
136         else static if (ms == MemoryOrder.acq_rel)
137             enum _ordering = AtomicOrdering.AcquireRelease;
138         else static if (ms == MemoryOrder.seq)
139             enum _ordering = AtomicOrdering.SequentiallyConsistent;
140         else static if (ms == MemoryOrder.raw)
141         {
142             // Note that C/C++ 'relaxed' is not the same as NoAtomic/Unordered,
143             // but Monotonic.
144             enum _ordering = AtomicOrdering.Monotonic;
145         }
146         else
147             static assert(0);
148     }
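
    // Compile-time sanity checks of the mapping to LLVM orderings; note that
    // `raw` maps to Monotonic (C/C++ 'relaxed'), not Unordered.
    unittest
    {
        static assert(_ordering!(MemoryOrder.raw) == AtomicOrdering.Monotonic);
        static assert(_ordering!(MemoryOrder.acq) == AtomicOrdering.Acquire);
        static assert(_ordering!(MemoryOrder.seq) == AtomicOrdering.SequentiallyConsistent);
    }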
149 
150     private template _AtomicType(T)
151     {
152         static if (T.sizeof == ubyte.sizeof)
153             alias _AtomicType = ubyte;
154         else static if (T.sizeof == ushort.sizeof)
155             alias _AtomicType = ushort;
156         else static if (T.sizeof == uint.sizeof)
157             alias _AtomicType = uint;
158         else static if (T.sizeof == ulong.sizeof)
159             alias _AtomicType = ulong;
160         else static if (T.sizeof == 2*ulong.sizeof && has128BitCAS)
161         {
162             struct UCent
163             {
164                 ulong value1;
165                 ulong value2;
166             }
167 
168             alias _AtomicType = UCent;
169         }
170         else
171             static assert(is(_AtomicType!T),
172                 "Cannot atomically load/store type of size " ~ T.sizeof.stringof);
173     }
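
    // The size mapping used above: every type is routed through the unsigned
    // integer (or, for two-word types, the UCent pair) of the same size.
    unittest
    {
        static assert(is(_AtomicType!byte == ubyte));
        static assert(is(_AtomicType!short == ushort));
        static assert(is(_AtomicType!float == uint));
        static assert(is(_AtomicType!double == ulong));
    }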
174 }
175 else: // !LDC
176 
177 version (DigitalMars)
178 {
179     private
180     {
181         enum : int
182         {
183             AX, BX, CX, DX, DI, SI, R8, R9
184         }
185 
186         immutable string[4][8] registerNames = [
187             [ "AL", "AX", "EAX", "RAX" ],
188             [ "BL", "BX", "EBX", "RBX" ],
189             [ "CL", "CX", "ECX", "RCX" ],
190             [ "DL", "DX", "EDX", "RDX" ],
191             [ "DIL", "DI", "EDI", "RDI" ],
192             [ "SIL", "SI", "ESI", "RSI" ],
193             [ "R8B", "R8W", "R8D", "R8" ],
194             [ "R9B", "R9W", "R9D", "R9" ],
195         ];
196 
197         template RegIndex(T)
198         {
199             static if (T.sizeof == 1)
200                 enum RegIndex = 0;
201             else static if (T.sizeof == 2)
202                 enum RegIndex = 1;
203             else static if (T.sizeof == 4)
204                 enum RegIndex = 2;
205             else static if (T.sizeof == 8)
206                 enum RegIndex = 3;
207             else
208                 static assert(false, "Invalid type");
209         }
210 
211         enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T];
212     }
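
    // Compile-time checks of the register-name helpers used by the asm
    // templates below: the index selects the register family, the type its width.
    unittest
    {
        static assert(RegIndex!ushort == 1);
        static assert(SizedReg!(AX, ubyte) == "AL");
        static assert(SizedReg!(DX, uint) == "EDX");
    }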
213 
214     inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
215         if (CanCAS!T)
216     {
217         static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
218 
219         static if (T.sizeof == size_t.sizeof * 2)
220         {
221             version (D_InlineAsm_X86)
222             {
223                 asm pure nothrow @nogc @trusted
224                 {
225                     push EDI;
226                     push EBX;
227                     mov EBX, 0;
228                     mov ECX, 0;
229                     mov EAX, 0;
230                     mov EDX, 0;
231                     mov EDI, src;
232                     lock; cmpxchg8b [EDI];
233                     pop EBX;
234                     pop EDI;
235                 }
236             }
237             else version (D_InlineAsm_X86_64)
238             {
239                 version (Windows)
240                 {
241                     static if (RegisterReturn!T)
242                     {
243                         enum SrcPtr = SizedReg!CX;
244                         enum RetPtr = null;
245                     }
246                     else
247                     {
248                         enum SrcPtr = SizedReg!DX;
249                         enum RetPtr = SizedReg!CX;
250                     }
251 
252                     mixin (simpleFormat(q{
253                         asm pure nothrow @nogc @trusted
254                         {
255                             naked;
256                             push RBX;
257                             mov R8, %0;
258     ?1                        mov R9, %1;
259                             mov RBX, 0;
260                             mov RCX, 0;
261                             mov RAX, 0;
262                             mov RDX, 0;
263                             lock; cmpxchg16b [R8];
264     ?1                        mov [R9], RAX;
265     ?1                        mov 8[R9], RDX;
266                             pop RBX;
267                             ret;
268                         }
269                     }, [SrcPtr, RetPtr]));
270                 }
271                 else
272                 {
273                     asm pure nothrow @nogc @trusted
274                     {
275                         naked;
276                         push RBX;
277                         mov RBX, 0;
278                         mov RCX, 0;
279                         mov RAX, 0;
280                         mov RDX, 0;
281                         lock; cmpxchg16b [RDI];
282                         pop RBX;
283                         ret;
284                     }
285                 }
286             }
287         }
288         else static if (needsLoadBarrier!order)
289         {
290             version (D_InlineAsm_X86)
291             {
292                 enum SrcReg = SizedReg!CX;
293                 enum ZeroReg = SizedReg!(DX, T);
294                 enum ResReg = SizedReg!(AX, T);
295 
296                 mixin (simpleFormat(q{
297                     asm pure nothrow @nogc @trusted
298                     {
299                         mov %1, 0;
300                         mov %2, 0;
301                         mov %0, src;
302                         lock; cmpxchg [%0], %1;
303                     }
304                 }, [SrcReg, ZeroReg, ResReg]));
305             }
306             else version (D_InlineAsm_X86_64)
307             {
308                 version (Windows)
309                     enum SrcReg = SizedReg!CX;
310                 else
311                     enum SrcReg = SizedReg!DI;
312                 enum ZeroReg = SizedReg!(DX, T);
313                 enum ResReg = SizedReg!(AX, T);
314 
315                 mixin (simpleFormat(q{
316                     asm pure nothrow @nogc @trusted
317                     {
318                         naked;
319                         mov %1, 0;
320                         mov %2, 0;
321                         lock; cmpxchg [%0], %1;
322                         ret;
323                     }
324                 }, [SrcReg, ZeroReg, ResReg]));
325             }
326         }
327         else
328             return *src;
329     }
330 
331     void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
332         if (CanCAS!T)
333     {
334         static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
335 
336         static if (T.sizeof == size_t.sizeof * 2)
337         {
338             version (D_InlineAsm_X86)
339             {
340                 asm pure nothrow @nogc @trusted
341                 {
342                     push EDI;
343                     push EBX;
344                     lea EDI, value;
345                     mov EBX, [EDI];
346                     mov ECX, 4[EDI];
347                     mov EDI, dest;
348                     mov EAX, [EDI];
349                     mov EDX, 4[EDI];
350                 L1: lock; cmpxchg8b [EDI];
351                     jne L1;
352                     pop EBX;
353                     pop EDI;
354                 }
355             }
356             else version (D_InlineAsm_X86_64)
357             {
358                 version (Windows)
359                 {
360                     asm pure nothrow @nogc @trusted
361                     {
362                         naked;
363                         push RBX;
364                         mov R8, RDX;
365                         mov RAX, [RDX];
366                         mov RDX, 8[RDX];
367                         mov RBX, [RCX];
368                         mov RCX, 8[RCX];
369                     L1: lock; cmpxchg16b [R8];
370                         jne L1;
371                         pop RBX;
372                         ret;
373                     }
374                 }
375                 else
376                 {
377                     asm pure nothrow @nogc @trusted
378                     {
379                         naked;
380                         push RBX;
381                         mov RBX, RDI;
382                         mov RCX, RSI;
383                         mov RDI, RDX;
384                         mov RAX, [RDX];
385                         mov RDX, 8[RDX];
386                     L1: lock; cmpxchg16b [RDI];
387                         jne L1;
388                         pop RBX;
389                         ret;
390                     }
391                 }
392             }
393         }
394         else static if (needsStoreBarrier!order)
395             atomicExchange!(order, false)(dest, value);
396         else
397             *dest = value;
398     }
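
    // A minimal check of the store/load contract for word-sized values.
    unittest
    {
        size_t x;
        atomicStore(&x, cast(size_t)123);
        assert(atomicLoad(&x) == 123);
        assert(atomicLoad!(MemoryOrder.raw)(&x) == 123);
    }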
399 
400     T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
401         if (is(T : ulong))
402     {
403         version (D_InlineAsm_X86)
404         {
405             static assert(T.sizeof <= 4, "64bit atomicFetchAdd not supported on 32bit target." );
406 
407             enum DestReg = SizedReg!DX;
408             enum ValReg = SizedReg!(AX, T);
409 
410             mixin (simpleFormat(q{
411                 asm pure nothrow @nogc @trusted
412                 {
413                     mov %1, value;
414                     mov %0, dest;
415                     lock; xadd[%0], %1;
416                 }
417             }, [DestReg, ValReg]));
418         }
419         else version (D_InlineAsm_X86_64)
420         {
421             version (Windows)
422             {
423                 enum DestReg = SizedReg!DX;
424                 enum ValReg = SizedReg!(CX, T);
425             }
426             else
427             {
428                 enum DestReg = SizedReg!SI;
429                 enum ValReg = SizedReg!(DI, T);
430             }
431             enum ResReg = result ? SizedReg!(AX, T) : null;
432 
433             mixin (simpleFormat(q{
434                 asm pure nothrow @nogc @trusted
435                 {
436                     naked;
437                     lock; xadd[%0], %1;
438     ?2                mov %2, %1;
439                     ret;
440                 }
441             }, [DestReg, ValReg, ResReg]));
442         }
443         else
444             static assert (false, "Unsupported architecture.");
445     }
446 
447     T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
448         if (is(T : ulong))
449     {
        return atomicFetchAdd!(order, result)(dest, cast(T)-cast(IntOrLong!T)value);
451     }
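
    // The fetch-add/fetch-sub pair returns the value held before the update.
    unittest
    {
        uint x = 5;
        assert(atomicFetchAdd(&x, 3u) == 5);
        assert(atomicFetchSub(&x, 2u) == 8);
        assert(x == 6);
    }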
452 
453     T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
454     if (CanCAS!T)
455     {
456         version (D_InlineAsm_X86)
457         {
458             static assert(T.sizeof <= 4, "64bit atomicExchange not supported on 32bit target." );
459 
460             enum DestReg = SizedReg!CX;
461             enum ValReg = SizedReg!(AX, T);
462 
463             mixin (simpleFormat(q{
464                 asm pure nothrow @nogc @trusted
465                 {
466                     mov %1, value;
467                     mov %0, dest;
468                     xchg [%0], %1;
469                 }
470             }, [DestReg, ValReg]));
471         }
472         else version (D_InlineAsm_X86_64)
473         {
474             version (Windows)
475             {
476                 enum DestReg = SizedReg!DX;
477                 enum ValReg = SizedReg!(CX, T);
478             }
479             else
480             {
481                 enum DestReg = SizedReg!SI;
482                 enum ValReg = SizedReg!(DI, T);
483             }
484             enum ResReg = result ? SizedReg!(AX, T) : null;
485 
486             mixin (simpleFormat(q{
487                 asm pure nothrow @nogc @trusted
488                 {
489                     naked;
490                     xchg [%0], %1;
491     ?2                mov %2, %1;
492                     ret;
493                 }
494             }, [DestReg, ValReg, ResReg]));
495         }
496         else
497             static assert (false, "Unsupported architecture.");
498     }
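
    // Sketch of the swap contract: the previous contents are returned and the
    // new value is left behind.
    unittest
    {
        uint x = 1;
        assert(atomicExchange(&x, 2u) == 1);
        assert(x == 2);
    }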
499 
500     alias atomicCompareExchangeWeak = atomicCompareExchangeStrong;
501 
502     bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
503         if (CanCAS!T)
504     {
505         version (D_InlineAsm_X86)
506         {
507             static if (T.sizeof <= 4)
508             {
509                 enum DestAddr = SizedReg!CX;
510                 enum CmpAddr = SizedReg!DI;
511                 enum Val = SizedReg!(DX, T);
512                 enum Cmp = SizedReg!(AX, T);
513 
514                 mixin (simpleFormat(q{
515                     asm pure nothrow @nogc @trusted
516                     {
517                         push %1;
518                         mov %2, value;
519                         mov %1, compare;
520                         mov %3, [%1];
521                         mov %0, dest;
522                         lock; cmpxchg [%0], %2;
523                         mov [%1], %3;
524                         setz AL;
525                         pop %1;
526                     }
527                 }, [DestAddr, CmpAddr, Val, Cmp]));
528             }
529             else static if (T.sizeof == 8)
530             {
531                 asm pure nothrow @nogc @trusted
532                 {
533                     push EDI;
534                     push EBX;
535                     lea EDI, value;
536                     mov EBX, [EDI];
537                     mov ECX, 4[EDI];
538                     mov EDI, compare;
539                     mov EAX, [EDI];
540                     mov EDX, 4[EDI];
541                     mov EDI, dest;
542                     lock; cmpxchg8b [EDI];
543                     mov EDI, compare;
544                     mov [EDI], EAX;
545                     mov 4[EDI], EDX;
546                     setz AL;
547                     pop EBX;
548                     pop EDI;
549                 }
550             }
551             else
552                 static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrong not supported on 32bit target." );
553         }
554         else version (D_InlineAsm_X86_64)
555         {
556             static if (T.sizeof <= 8)
557             {
558                 version (Windows)
559                 {
560                     enum DestAddr = SizedReg!R8;
561                     enum CmpAddr = SizedReg!DX;
562                     enum Val = SizedReg!(CX, T);
563                 }
564                 else
565                 {
566                     enum DestAddr = SizedReg!DX;
567                     enum CmpAddr = SizedReg!SI;
568                     enum Val = SizedReg!(DI, T);
569                 }
570                 enum Res = SizedReg!(AX, T);
571 
572                 mixin (simpleFormat(q{
573                     asm pure nothrow @nogc @trusted
574                     {
575                         naked;
576                         mov %3, [%1];
577                         lock; cmpxchg [%0], %2;
578                         jne compare_fail;
579                         mov AL, 1;
580                         ret;
581                     compare_fail:
582                         mov [%1], %3;
583                         xor AL, AL;
584                         ret;
585                     }
586                 }, [DestAddr, CmpAddr, Val, Res]));
587             }
588             else
589             {
590                 version (Windows)
591                 {
592                     asm pure nothrow @nogc @trusted
593                     {
594                         naked;
595                         push RBX;
596                         mov R9, RDX;
597                         mov RAX, [RDX];
598                         mov RDX, 8[RDX];
599                         mov RBX, [RCX];
600                         mov RCX, 8[RCX];
601                         lock; cmpxchg16b [R8];
602                         pop RBX;
603                         jne compare_fail;
604                         mov AL, 1;
605                         ret;
606                     compare_fail:
607                         mov [R9], RAX;
608                         mov 8[R9], RDX;
609                         xor AL, AL;
610                         ret;
611                     }
612                 }
613                 else
614                 {
615                     asm pure nothrow @nogc @trusted
616                     {
617                         naked;
618                         push RBX;
619                         mov R8, RCX;
620                         mov R9, RDX;
621                         mov RAX, [RDX];
622                         mov RDX, 8[RDX];
623                         mov RBX, RDI;
624                         mov RCX, RSI;
625                         lock; cmpxchg16b [R8];
626                         pop RBX;
627                         jne compare_fail;
628                         mov AL, 1;
629                         ret;
630                     compare_fail:
631                         mov [R9], RAX;
632                         mov 8[R9], RDX;
633                         xor AL, AL;
634                         ret;
635                     }
636                 }
637             }
638         }
639         else
640             static assert (false, "Unsupported architecture.");
641     }
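
    // Sketch of the CAS contract: on success `*dest` takes the new value; on
    // failure `*compare` is updated with the value actually observed.
    unittest
    {
        uint x = 1;
        uint expected = 1;
        assert(atomicCompareExchangeStrong(&x, &expected, 2u));
        assert(x == 2);

        expected = 1; // stale expectation
        assert(!atomicCompareExchangeStrong(&x, &expected, 3u));
        assert(x == 2 && expected == 2);
    }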
642 
643     alias atomicCompareExchangeWeakNoResult = atomicCompareExchangeStrongNoResult;
644 
645     bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
646         if (CanCAS!T)
647     {
648         version (D_InlineAsm_X86)
649         {
650             static if (T.sizeof <= 4)
651             {
652                 enum DestAddr = SizedReg!CX;
653                 enum Cmp = SizedReg!(AX, T);
654                 enum Val = SizedReg!(DX, T);
655 
656                 mixin (simpleFormat(q{
657                     asm pure nothrow @nogc @trusted
658                     {
659                         mov %2, value;
660                         mov %1, compare;
661                         mov %0, dest;
662                         lock; cmpxchg [%0], %2;
663                         setz AL;
664                     }
665                 }, [DestAddr, Cmp, Val]));
666             }
667             else static if (T.sizeof == 8)
668             {
669                 asm pure nothrow @nogc @trusted
670                 {
671                     push EDI;
672                     push EBX;
673                     lea EDI, value;
674                     mov EBX, [EDI];
675                     mov ECX, 4[EDI];
676                     lea EDI, compare;
677                     mov EAX, [EDI];
678                     mov EDX, 4[EDI];
679                     mov EDI, dest;
680                     lock; cmpxchg8b [EDI];
681                     setz AL;
682                     pop EBX;
683                     pop EDI;
684                 }
685             }
686             else
687                 static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrong not supported on 32bit target." );
688         }
689         else version (D_InlineAsm_X86_64)
690         {
691             static if (T.sizeof <= 8)
692             {
693                 version (Windows)
694                 {
695                     enum DestAddr = SizedReg!R8;
696                     enum Cmp = SizedReg!(DX, T);
697                     enum Val = SizedReg!(CX, T);
698                 }
699                 else
700                 {
701                     enum DestAddr = SizedReg!DX;
702                     enum Cmp = SizedReg!(SI, T);
703                     enum Val = SizedReg!(DI, T);
704                 }
705                 enum AXReg = SizedReg!(AX, T);
706 
707                 mixin (simpleFormat(q{
708                     asm pure nothrow @nogc @trusted
709                     {
710                         naked;
711                         mov %3, %1;
712                         lock; cmpxchg [%0], %2;
713                         setz AL;
714                         ret;
715                     }
716                 }, [DestAddr, Cmp, Val, AXReg]));
717             }
718             else
719             {
720                 version (Windows)
721                 {
722                     asm pure nothrow @nogc @trusted
723                     {
724                         naked;
725                         push RBX;
726                         mov RAX, [RDX];
727                         mov RDX, 8[RDX];
728                         mov RBX, [RCX];
729                         mov RCX, 8[RCX];
730                         lock; cmpxchg16b [R8];
731                         setz AL;
732                         pop RBX;
733                         ret;
734                     }
735                 }
736                 else
737                 {
738                     asm pure nothrow @nogc @trusted
739                     {
740                         naked;
741                         push RBX;
742                         mov RAX, RDX;
743                         mov RDX, RCX;
744                         mov RBX, RDI;
745                         mov RCX, RSI;
746                         lock; cmpxchg16b [R8];
747                         setz AL;
748                         pop RBX;
749                         ret;
750                     }
751                 }
752             }
753         }
754         else
755             static assert (false, "Unsupported architecture.");
756     }
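
    // The *NoResult variants only report success; the caller's `compare` value
    // is never written back.
    unittest
    {
        uint x = 1;
        assert( atomicCompareExchangeStrongNoResult(&x, 1u, 5u));
        assert(!atomicCompareExchangeStrongNoResult(&x, 1u, 6u));
        assert(x == 5);
    }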
757 
758     void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
759     {
760         // TODO: `mfence` should only be required for seq_cst operations, but this depends on
761         //       the compiler's backend knowledge to not reorder code inappropriately,
762         //       so we'll apply it conservatively.
763         static if (order != MemoryOrder.raw)
764         {
765             version (D_InlineAsm_X86)
766             {
767                 import core.cpuid;
768 
769                 // TODO: review this implementation; it seems way overly complicated
770                 asm pure nothrow @nogc @trusted
771                 {
772                     naked;
773 
774                     call sse2;
775                     test AL, AL;
776                     jne Lcpuid;
777 
778                     // Fast path: We have SSE2, so just use mfence.
779                     mfence;
780                     jmp Lend;
781 
782                 Lcpuid:
783 
784                     // Slow path: We use cpuid to serialize. This is
785                     // significantly slower than mfence, but is the
786                     // only serialization facility we have available
787                     // on older non-SSE2 chips.
788                     push EBX;
789 
790                     mov EAX, 0;
791                     cpuid;
792 
793                     pop EBX;
794 
795                 Lend:
796 
797                     ret;
798                 }
799             }
800             else version (D_InlineAsm_X86_64)
801             {
802                 asm pure nothrow @nogc @trusted
803                 {
804                     naked;
805                     mfence;
806                     ret;
807                 }
808             }
809             else
810                 static assert (false, "Unsupported architecture.");
811         }
812     }
813 
814     void pause() pure nothrow @nogc @trusted
815     {
816         version (D_InlineAsm_X86)
817         {
818             asm pure nothrow @nogc @trusted
819             {
820                 naked;
821                 rep; nop;
822                 ret;
823             }
824         }
825         else version (D_InlineAsm_X86_64)
826         {
827             asm pure nothrow @nogc @trusted
828             {
829                 naked;
830     //            pause; // TODO: DMD should add this opcode to its inline asm
831                 rep; nop;
832                 ret;
833             }
834         }
835         else
836         {
837             // ARM should `yield`
838             // other architectures? otherwise some sort of nop...
839         }
840     }
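
    // Typical use of `pause`: back off while spinning on a flag that another
    // thread will eventually set. `spinUntilSet` is a hypothetical helper,
    // shown only as a sketch.
    version (none)
    {
        private void spinUntilSet(bool* flag) nothrow @nogc
        {
            while (!atomicLoad!(MemoryOrder.acq)(flag))
                pause();
        }
    }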
841 }
842 else version (GNU)
843 {
844     import gcc.builtins;
845     import gcc.config;
846 
847     inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
848         if (CanCAS!T)
849     {
850         static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
851 
852         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
853         {
854             static if (T.sizeof == ubyte.sizeof)
855             {
856                 ubyte value = __atomic_load_1(cast(shared)src, order);
857                 return *cast(typeof(return)*)&value;
858             }
859             else static if (T.sizeof == ushort.sizeof)
860             {
861                 ushort value = __atomic_load_2(cast(shared)src, order);
862                 return *cast(typeof(return)*)&value;
863             }
864             else static if (T.sizeof == uint.sizeof)
865             {
866                 uint value = __atomic_load_4(cast(shared)src, order);
867                 return *cast(typeof(return)*)&value;
868             }
869             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
870             {
871                 ulong value = __atomic_load_8(cast(shared)src, order);
872                 return *cast(typeof(return)*)&value;
873             }
874             else static if (GNU_Have_LibAtomic)
875             {
876                 T value;
877                 __atomic_load(T.sizeof, cast(shared)src, &value, order);
878                 return *cast(typeof(return)*)&value;
879             }
880             else
881                 static assert(0, "Invalid template type specified.");
882         }
883         else
884         {
885             getAtomicMutex.lock();
886             scope(exit) getAtomicMutex.unlock();
            return *cast(typeof(return)*)src;
888         }
889     }
890 
891     void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
892         if (CanCAS!T)
893     {
894         static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
895 
896         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
897         {
898             static if (T.sizeof == ubyte.sizeof)
899                 __atomic_store_1(cast(shared)dest, *cast(ubyte*)&value, order);
900             else static if (T.sizeof == ushort.sizeof)
901                 __atomic_store_2(cast(shared)dest, *cast(ushort*)&value, order);
902             else static if (T.sizeof == uint.sizeof)
903                 __atomic_store_4(cast(shared)dest, *cast(uint*)&value, order);
904             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
905                 __atomic_store_8(cast(shared)dest, *cast(ulong*)&value, order);
906             else static if (GNU_Have_LibAtomic)
907                 __atomic_store(T.sizeof, cast(shared)dest, cast(void*)&value, order);
908             else
909                 static assert(0, "Invalid template type specified.");
910         }
911         else
912         {
913             getAtomicMutex.lock();
914             *dest = value;
915             getAtomicMutex.unlock();
916         }
917     }
918 
919     T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
920         if (is(T : ulong))
921     {
922         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
923         {
924             static if (T.sizeof == ubyte.sizeof)
925                 return __atomic_fetch_add_1(cast(shared)dest, value, order);
926             else static if (T.sizeof == ushort.sizeof)
927                 return __atomic_fetch_add_2(cast(shared)dest, value, order);
928             else static if (T.sizeof == uint.sizeof)
929                 return __atomic_fetch_add_4(cast(shared)dest, value, order);
930             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
931                 return __atomic_fetch_add_8(cast(shared)dest, value, order);
932             else static if (GNU_Have_LibAtomic)
933                 return __atomic_fetch_add(T.sizeof, cast(shared)dest, cast(void*)&value, order);
934             else
935                 static assert(0, "Invalid template type specified.");
936         }
937         else
938         {
939             getAtomicMutex.lock();
940             scope(exit) getAtomicMutex.unlock();
941             T tmp = *dest;
942             *dest += value;
943             return tmp;
944         }
945     }
946 
947     T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
948         if (is(T : ulong))
949     {
950         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
951         {
952             static if (T.sizeof == ubyte.sizeof)
953                 return __atomic_fetch_sub_1(cast(shared)dest, value, order);
954             else static if (T.sizeof == ushort.sizeof)
955                 return __atomic_fetch_sub_2(cast(shared)dest, value, order);
956             else static if (T.sizeof == uint.sizeof)
957                 return __atomic_fetch_sub_4(cast(shared)dest, value, order);
958             else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
959                 return __atomic_fetch_sub_8(cast(shared)dest, value, order);
960             else static if (GNU_Have_LibAtomic)
961                 return __atomic_fetch_sub(T.sizeof, cast(shared)dest, cast(void*)&value, order);
962             else
963                 static assert(0, "Invalid template type specified.");
964         }
965         else
966         {
967             getAtomicMutex.lock();
968             scope(exit) getAtomicMutex.unlock();
969             T tmp = *dest;
970             *dest -= value;
971             return tmp;
972         }
973     }
974 
975     T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
976         if (is(T : ulong) || is(T == class) || is(T == interface) || is(T U : U*))
977     {
978         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
979         {
980             static if (T.sizeof == byte.sizeof)
981             {
982                 ubyte res = __atomic_exchange_1(cast(shared)dest, *cast(ubyte*)&value, order);
983                 return *cast(typeof(return)*)&res;
984             }
985             else static if (T.sizeof == short.sizeof)
986             {
987                 ushort res = __atomic_exchange_2(cast(shared)dest, *cast(ushort*)&value, order);
988                 return *cast(typeof(return)*)&res;
989             }
990             else static if (T.sizeof == int.sizeof)
991             {
992                 uint res = __atomic_exchange_4(cast(shared)dest, *cast(uint*)&value, order);
993                 return *cast(typeof(return)*)&res;
994             }
995             else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
996             {
997                 ulong res = __atomic_exchange_8(cast(shared)dest, *cast(ulong*)&value, order);
998                 return *cast(typeof(return)*)&res;
999             }
1000             else static if (GNU_Have_LibAtomic)
1001             {
1002                 T res = void;
1003                 __atomic_exchange(T.sizeof, cast(shared)dest, cast(void*)&value, &res, order);
1004                 return res;
1005             }
1006             else
1007                 static assert(0, "Invalid template type specified.");
1008         }
1009         else
1010         {
1011             getAtomicMutex.lock();
1012             scope(exit) getAtomicMutex.unlock();
1013 
1014             T res = *dest;
1015             *dest = value;
1016             return res;
1017         }
1018     }
1019 
1020     bool atomicCompareExchangeWeak(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
1021         if (CanCAS!T)
1022     {
1023         return atomicCompareExchangeImpl!(succ, fail, true)(dest, compare, value);
1024     }
1025 
1026     bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
1027         if (CanCAS!T)
1028     {
1029         return atomicCompareExchangeImpl!(succ, fail, false)(dest, compare, value);
1030     }
1031 
1032     bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
1033         if (CanCAS!T)
1034     {
1035         return atomicCompareExchangeImpl!(succ, fail, false)(dest, cast(T*)&compare, value);
1036     }
1037 
1038     bool atomicCompareExchangeWeakNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
1039         if (CanCAS!T)
1040     {
1041         return atomicCompareExchangeImpl!(succ, fail, true)(dest, cast(T*)&compare, value);
1042     }
1043 
1044     private bool atomicCompareExchangeImpl(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, bool weak, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
1045         if (CanCAS!T)
1046     {
1047         bool res = void;
1048 
1049         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
1050         {
1051             static if (T.sizeof == byte.sizeof)
1052                 res = __atomic_compare_exchange_1(cast(shared)dest, compare, *cast(ubyte*)&value,
1053                                                   weak, succ, fail);
1054             else static if (T.sizeof == short.sizeof)
1055                 res = __atomic_compare_exchange_2(cast(shared)dest, compare, *cast(ushort*)&value,
1056                                                   weak, succ, fail);
1057             else static if (T.sizeof == int.sizeof)
1058                 res = __atomic_compare_exchange_4(cast(shared)dest, compare, *cast(uint*)&value,
1059                                                   weak, succ, fail);
1060             else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
1061                 res = __atomic_compare_exchange_8(cast(shared)dest, compare, *cast(ulong*)&value,
1062                                                   weak, succ, fail);
1063             else static if (GNU_Have_LibAtomic)
1064                 res = __atomic_compare_exchange(T.sizeof, cast(shared)dest, compare, cast(void*)&value,
1065                                                 succ, fail);
1066             else
1067                 static assert(0, "Invalid template type specified.");
1068         }
1069         else
1070         {
1071             static if (T.sizeof == byte.sizeof)
1072                 alias U = byte;
1073             else static if (T.sizeof == short.sizeof)
1074                 alias U = short;
1075             else static if (T.sizeof == int.sizeof)
1076                 alias U = int;
1077             else static if (T.sizeof == long.sizeof)
1078                 alias U = long;
1079             else
1080                 static assert(0, "Invalid template type specified.");
1081 
1082             getAtomicMutex.lock();
1083             scope(exit) getAtomicMutex.unlock();
1084 
            if (*cast(U*)dest == *cast(U*)compare)
1086             {
1087                 *dest = value;
1088                 res = true;
1089             }
1090             else
1091             {
1092                 *compare = *dest;
1093                 res = false;
1094             }
1095         }
1096 
1097         return res;
1098     }
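
    // A minimal check of the compare-exchange wrappers above: on failure the
    // `compare` argument is updated with the observed value.
    unittest
    {
        int x = 1;
        int expected = 1;
        assert(atomicCompareExchangeStrong(&x, &expected, 2));
        assert(atomicLoad(&x) == 2);

        expected = 1; // stale expectation
        assert(!atomicCompareExchangeWeak(&x, &expected, 3));
        assert(x == 2 && expected == 2);
    }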
1099 
1100     void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
1101     {
1102         static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
1103             __atomic_thread_fence(order);
1104         else
1105         {
1106             getAtomicMutex.lock();
1107             getAtomicMutex.unlock();
1108         }
1109     }
1110 
1111     void pause() pure nothrow @nogc @trusted
1112     {
1113         version (X86)
1114         {
1115             __builtin_ia32_pause();
1116         }
1117         else version (X86_64)
1118         {
1119             __builtin_ia32_pause();
1120         }
1121         else
1122         {
1123             // Other architectures? Some sort of nop or barrier.
1124         }
1125     }
1126 
1127     static if (!GNU_Have_Atomics && !GNU_Have_LibAtomic)
1128     {
1129         // Use system mutex for atomics, faking the purity of the functions so
1130         // that they can be used in pure/nothrow/@safe code.
1131         extern (C) private pure @trusted @nogc nothrow
1132         {
1133             static if (GNU_Thread_Model == ThreadModel.Posix)
1134             {
1135                 import core.sys.posix.pthread;
1136                 alias atomicMutexHandle = pthread_mutex_t;
1137 
1138                 pragma(mangle, "pthread_mutex_init") int fakePureMutexInit(pthread_mutex_t*, pthread_mutexattr_t*);
1139                 pragma(mangle, "pthread_mutex_lock") int fakePureMutexLock(pthread_mutex_t*);
1140                 pragma(mangle, "pthread_mutex_unlock") int fakePureMutexUnlock(pthread_mutex_t*);
1141             }
1142             else static if (GNU_Thread_Model == ThreadModel.Win32)
1143             {
1144                 import core.sys.windows.winbase;
1145                 alias atomicMutexHandle = CRITICAL_SECTION;
1146 
1147                 pragma(mangle, "InitializeCriticalSection") int fakePureMutexInit(CRITICAL_SECTION*);
1148                 pragma(mangle, "EnterCriticalSection") void fakePureMutexLock(CRITICAL_SECTION*);
1149                 pragma(mangle, "LeaveCriticalSection") int fakePureMutexUnlock(CRITICAL_SECTION*);
1150             }
1151             else
1152             {
1153                 alias atomicMutexHandle = int;
1154             }
1155         }
1156 
1157         // Implements lock/unlock operations.
1158         private struct AtomicMutex
1159         {
1160             int lock() pure @trusted @nogc nothrow
1161             {
1162                 static if (GNU_Thread_Model == ThreadModel.Posix)
1163                 {
1164                     if (!_inited)
1165                     {
1166                         fakePureMutexInit(&_handle, null);
1167                         _inited = true;
1168                     }
1169                     return fakePureMutexLock(&_handle);
1170                 }
1171                 else
1172                 {
1173                     static if (GNU_Thread_Model == ThreadModel.Win32)
1174                     {
1175                         if (!_inited)
1176                         {
1177                             fakePureMutexInit(&_handle);
1178                             _inited = true;
1179                         }
1180                         fakePureMutexLock(&_handle);
1181                     }
1182                     return 0;
1183                 }
1184             }
1185 
1186             int unlock() pure @trusted @nogc nothrow
1187             {
1188                 static if (GNU_Thread_Model == ThreadModel.Posix)
1189                     return fakePureMutexUnlock(&_handle);
1190                 else
1191                 {
1192                     static if (GNU_Thread_Model == ThreadModel.Win32)
1193                         fakePureMutexUnlock(&_handle);
1194                     return 0;
1195                 }
1196             }
1197 
1198         private:
1199             atomicMutexHandle _handle;
1200             bool _inited;
1201         }
1202 
1203         // Internal static mutex reference.
1204         private AtomicMutex* _getAtomicMutex() @trusted @nogc nothrow
1205         {
1206             __gshared static AtomicMutex mutex;
1207             return &mutex;
1208         }
1209 
1210         // Pure alias for _getAtomicMutex.
1211         pragma(mangle, _getAtomicMutex.mangleof)
1212         private AtomicMutex* getAtomicMutex() pure @trusted @nogc nothrow @property;
1213     }
1214 }
1215 
1216 private:
1217 
1218 version (Windows)
1219 {
1220     enum RegisterReturn(T) = is(T : U[], U) || is(T : R delegate(A), R, A...);
1221 }
1222 
1223 enum CanCAS(T) = is(T : ulong) ||
1224                  is(T == class) ||
1225                  is(T == interface) ||
1226                  is(T : U*, U) ||
1227                  is(T : U[], U) ||
1228                  is(T : R delegate(A), R, A...) ||
1229                  (is(T == struct) && __traits(isPOD, T) &&
1230                   (T.sizeof <= size_t.sizeof*2 ||       // no more than 2 words
1231                    (T.sizeof == 16 && has128BitCAS)) && // or supports 128-bit CAS
1232                   (T.sizeof & (T.sizeof - 1)) == 0      // is power of 2
1233                  );
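
// A few sanity checks of the predicate: word-sized scalars, pointers, slices
// and power-of-two POD structs of up to two words qualify; odd sizes do not.
unittest
{
    static struct ThreeBytes { ubyte[3] payload; }
    static assert( CanCAS!uint);
    static assert( CanCAS!(void*));
    static assert( CanCAS!(int[]));
    static assert(!CanCAS!ThreeBytes);
}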
1234 
1235 template IntOrLong(T)
1236 {
1237     static if (T.sizeof > 4)
1238         alias IntOrLong = long;
1239     else
1240         alias IntOrLong = int;
1241 }
1242 
1243 // NOTE: x86 loads implicitly have acquire semantics so a memory
1244 //       barrier is only necessary on releases.
1245 template needsLoadBarrier( MemoryOrder ms )
1246 {
1247     enum bool needsLoadBarrier = ms == MemoryOrder.seq;
1248 }
1249 
1250 
1251 // NOTE: x86 stores implicitly have release semantics so a memory
1252 //       barrier is only necessary on acquires.
1253 template needsStoreBarrier( MemoryOrder ms )
1254 {
1255     enum bool needsStoreBarrier = ms == MemoryOrder.seq;
1256 }
1257 
// This is a CTFE helper used to build the asm blocks above: '%n' is replaced
// with args[n], and a line marked '?n' is kept only when args[n] is non-null.
1259 string simpleFormat(string format, scope string[] args)
1260 {
1261     string result;
1262     outer: while (format.length)
1263     {
1264         foreach (i; 0 .. format.length)
1265         {
1266             if (format[i] == '%' || format[i] == '?')
1267             {
1268                 bool isQ = format[i] == '?';
1269                 result ~= format[0 .. i++];
1270                 assert (i < format.length, "Invalid format string");
1271                 if (format[i] == '%' || format[i] == '?')
1272                 {
1273                     assert(!isQ, "Invalid format string");
1274                     result ~= format[i++];
1275                 }
1276                 else
1277                 {
1278                     int index = 0;
1279                     assert (format[i] >= '0' && format[i] <= '9', "Invalid format string");
1280                     while (i < format.length && format[i] >= '0' && format[i] <= '9')
1281                         index = index * 10 + (ubyte(format[i++]) - ubyte('0'));
1282                     if (!isQ)
1283                         result ~= args[index];
1284                     else if (!args[index])
1285                     {
1286                         size_t j = i;
1287                         for (; j < format.length;)
1288                         {
1289                             if (format[j++] == '\n')
1290                                 break;
1291                         }
1292                         i = j;
1293                     }
1294                 }
1295                 format = format[i .. $];
1296                 continue outer;
1297             }
1298         }
1299         result ~= format;
1300         break;
1301     }
1302     return result;
1303 }
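
// A small CTFE check of the conventions described above.
unittest
{
    static assert(simpleFormat("mov %0, 0;\n?1 mov %1, 0;\n", ["RAX", null])
                  == "mov RAX, 0;\n");
    static assert(simpleFormat("xchg [%0], %1;", ["RCX", "RAX"])
                  == "xchg [RCX], RAX;");
}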