1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
CMPXCHG16B no longer produces optimal code
This /used/ to produce optimal code before SRA was introduced, and maybe before some enums were rearranged.
Test case:
```asm
%ifdef CONFIG
{
}
%endif
mov rsp, 0xe0001000
cmpxchg16b [rsp]
hlt
```
Generated AArch64 code for cmpxchg16b:
```asm
0x00007f3ec2e80060 10ffffe0 adr x0, #-0x4 (addr 0x7f3ec2e8005c)
0x00007f3ec2e80064 f9005f80 str x0, [x28, #184]
; Move SRA registers RDX:RAX into an element pair (Expected).
0x00007f3ec2e80068 aa0403f4 mov x20, x4
0x00007f3ec2e8006c aa0603f5 mov x21, x6
; Move SRA registers RCX:RBX into an element pair (Desired).
0x00007f3ec2e80070 aa0703f6 mov x22, x7
0x00007f3ec2e80074 aa0503f7 mov x23, x5
; Move *Expected* into temporaries, because the ARM instruction overwrites them. (In IR op)
; We don't detect when Dst == Expected, nor do we have constraints to enforce it.
0x00007f3ec2e80078 aa1403e2 mov x2, x20
0x00007f3ec2e8007c aa1503e3 mov x3, x21
; Rock the caspal
0x00007f3ec2e80080 4862fd16 caspal x2, x3, x22, x23, [x8]
; Move the result back into the destination. (In IR op)
0x00007f3ec2e80084 aa0203f4 mov x20, x2
0x00007f3ec2e80088 aa0303f5 mov x21, x3
; ExtractElementPair(0) - Ugh RA.
0x00007f3ec2e8008c aa1403f6 mov x22, x20
; ExtractElementPair(1) - Ugh RA.
0x00007f3ec2e80090 aa1503f4 mov x20, x21
; Calculate if Memory == Expected
0x00007f3ec2e80094 ca0402d5 eor x21, x22, x4
0x00007f3ec2e80098 ca060297 eor x23, x20, x6
0x00007f3ec2e8009c aa1702b5 orr x21, x21, x23
0x00007f3ec2e800a0 f10002bf cmp x21, #0x0 (0)
0x00007f3ec2e800a4 9a9f17f7 cset x23, eq
; Set ZF to result
0x00007f3ec2e800a8 390b1b97 strb w23, [x28, #710]
; Skip the block if memory matched Expected (ZF set).
0x00007f3ec2e800ac b4000075 cbz x21, #+0xc (addr 0x7f3ec2e800b8)
; If memory did not match Expected (ZF clear), RDX:RAX gets set to the value that was loaded from memory.
; Need to be careful here: CMPXCHG8B must /not/ clear the upper bits if memory matched Expected.
; Could generate more optimal code if we know the upper bits are already zero, if we are operating in 32-bit mode, or for 16B, since the upper bits will always be written.
0x00007f3ec2e800b0 aa1603e4 mov x4, x22
0x00007f3ec2e800b4 aa1403e6 mov x6, x20
; Tail block.
0x00007f3ec2e800b8 58000040 ldr x0, pc+8 (addr 0x7f3ec2e800c0)
0x00007f3ec2e800bc d63f0000 blr x0
0x00007f3ec2e800c0 d8d7f128 prfm plil1keep, pc-328156 (addr 0x7f3ec2e2fee4)
0x00007f3ec2e800c4 00007f3e udf #0x7f3e
0x00007f3ec2e800c8 0001000a unallocated (Unallocated)
0x00007f3ec2e800cc 00000000 udf #0x0
```
|