逆向工程 - 如何改进SSE指令的Ghidra反编译？ - 吾爱随笔录

在一些可执行文件中的一些标准库函数上练习 Ghidra 时，我通常无法在使用 SSE 寄存器作为优化的情况下获得良好的反编译输出。我试过在这里和互联网的其他地方搜索各种关键字组合，但找不到任何相关的内容。

下面的例子来自没有调试符号的 x64 代码，所以函数/结构体/变量的名称都是我自己起的。类型的选择并没有改变或改善这里展示的问题——不过也可能只是我做错了。我还尽量保留了足够的上下文以便阅读，但我没有找到本站在这方面的任何指导方针。

当 std::string 的 32 个字节（作为移动构造的一部分）通过两条 128 位 SSE 移动指令被复制时，反编译结果把整个复制操作拆成了若干 4 字节的块：

为完整起见，这是我的 std::string 数据类型定义（导出）：

/* Hand-made Ghidra data type for the std::string objects in this binary.
 * The binary has no debug symbols, so the field names are the author's own.
 * Four 8-byte fields on x64, 32 bytes in total. */
struct std_string {
  /* Offset 0x00. The decompiled code also writes a single zero byte here.
   * NOTE(review): data + field_1 presumably overlay a 16-byte inline
   * small-string buffer -- confirm against the target runtime. */
  char * data;
  /* Offset 0x08. Purpose not identified (placeholder name). */
  char * field_1;
  /* Offset 0x10. */
  ulonglong size;
  /* Offset 0x18. Reset to 0xf on the moved-from object in the example. */
  ulonglong capacity;
};

反汇编：

                         **************************************************************
                         *                          FUNCTION                          *
                         **************************************************************
                         std_string * __fastcall std_string_operator+(std_string 
         std_string *      RAX:8          <RETURN>
         std_string *      RCX:8          thisOut
         undefined8        RDX:8          thisIn
         char *            R8:8           stringIn                                XREF[1]:     140106dcc(W)  
         longlong          R8:8           size                                    XREF[1]:     140106dcc(W)  
         undefined8        RAX:8          thisIn_                                 XREF[1]:     140106de6(W)  
                         std_string_operator+
   140106dc0 40 53           PUSH       RBX
   140106dc2 48 83 ec 20     SUB        RSP,0x20
   140106dc6 49 8b c0        MOV        RAX,stringIn
   140106dc9 4c 8b ca        MOV        R9,thisIn
   140106dcc 49 c7 c0        MOV        size,-0x1
             ff ff ff ff
   140106dd3 48 8b d9        MOV        RBX,thisOut
                         LAB_140106dd6                                   XREF[1]:     140106dde(j)  
   140106dd6 49 ff c0        INC        size
   140106dd9 42 80 3c        CMP        byte ptr [RAX + size*0x1],0x0
             00 00
   140106dde 75 f6           JNZ        LAB_140106dd6
   140106de0 48 8b d0        MOV        thisIn,RAX
   140106de3 49 8b c9        MOV        thisOut,R9
   140106de6 e8 75 fe        CALL       std_string_append                                std_string * std_string_append(s
             ff ff
   140106deb 33 c9           XOR        thisOut,thisOut
   140106ded 48 89 4b 10     MOV        qword ptr [RBX + 0x10],thisOut
   140106df1 48 89 4b 18     MOV        qword ptr [RBX + 0x18],thisOut
   140106df5 0f 10 00        MOVUPS     XMM0,xmmword ptr [thisIn_]
   140106df8 0f 11 03        MOVUPS     xmmword ptr [RBX],XMM0
   140106dfb 0f 10 48 10     MOVUPS     XMM1,xmmword ptr [thisIn_ + 0x10]
   140106dff 0f 11 4b 10     MOVUPS     xmmword ptr [RBX + 0x10],XMM1
   140106e03 48 89 48 10     MOV        qword ptr [thisIn_ + 0x10],thisOut
   140106e07 48 c7 40        MOV        qword ptr [thisIn_ + 0x18],0xf
             18 0f 00 
             00 00
   140106e0f 88 08           MOV        byte ptr [thisIn_],thisOut
   140106e11 48 8b c3        MOV        thisIn_,RBX
   140106e14 48 83 c4 20     ADD        RSP,0x20
   140106e18 5b              POP        RBX
   140106e19 c3              RET

反编译：

// Ghidra decompiler output for std_string_operator+ (names and types were
// assigned by hand; the binary has no debug symbols).
// Visible behaviour: measures stringIn, appends it to thisIn through
// std_string_append, copies the 32-byte result object into thisOut, resets
// the source object, and returns thisOut.
std_string * std_string_operator+(std_string *thisOut,std_string *thisIn,char *stringIn)
{
  // uVar1..uVar3 are 4-byte temporaries introduced by the decompiler for the
  // pieces of each 16-byte copy below.
  undefined4 uVar1;
  undefined4 uVar2;
  undefined4 uVar3;
  // Value returned by std_string_append (RAX after the call in the listing).
  std_string *thisIn_;
  longlong size;

  // Inlined length scan: size ends up as the index of the first NUL byte.
  size = -1;
  do {
    size = size + 1;
  } while (stringIn[size] != '\0');
  thisIn_ = std_string_append(thisIn,stringIn,size);
  // Zero the destination's size/capacity before the copy (the two qword
  // stores to +0x10 / +0x18 in the listing).
  thisOut->size = 0;
  thisOut->capacity = 0;
  // First 16 bytes (data + field_1), shown as four 4-byte moves. In the
  // listing this is the MOVUPS XMM0 load/store pair at 140106df5 / 140106df8.
  uVar1 = *(undefined4 *)((longlong)&thisIn_->data + 4);
  uVar2 = *(undefined4 *)&thisIn_->field_1;
  uVar3 = *(undefined4 *)((longlong)&thisIn_->field_1 + 4);
  *(undefined4 *)&thisOut->data = *(undefined4 *)&thisIn_->data;
  *(undefined4 *)((longlong)&thisOut->data + 4) = uVar1;
  *(undefined4 *)&thisOut->field_1 = uVar2;
  *(undefined4 *)((longlong)&thisOut->field_1 + 4) = uVar3;
  // Second 16 bytes (size + capacity): the MOVUPS XMM1 load/store pair at
  // 140106dfb / 140106dff.
  uVar1 = *(undefined4 *)((longlong)&thisIn_->size + 4);
  uVar2 = *(undefined4 *)&thisIn_->capacity;
  uVar3 = *(undefined4 *)((longlong)&thisIn_->capacity + 4);
  *(undefined4 *)&thisOut->size = *(undefined4 *)&thisIn_->size;
  *(undefined4 *)((longlong)&thisOut->size + 4) = uVar1;
  *(undefined4 *)&thisOut->capacity = uVar2;
  *(undefined4 *)((longlong)&thisOut->capacity + 4) = uVar3;
  // Reset the source object: size = 0, capacity = 0xf, first byte = 0.
  // NOTE(review): presumably the empty small-string state left behind by a
  // move -- confirm against the target runtime's std::string layout.
  thisIn_->size = 0;
  thisIn_->capacity = 0xf;
  *(undefined *)&thisIn_->data = 0;
  return thisOut;
}

如果反编译结果显示为分别复制四个 8 字节的字段，或者（以某种形式）表示为两次 128 位复制或一次 256 位复制，我都能理解。我猜上面这些 4 字节的块是 Ghidra 对 MOVUPS 的建模方式，但这对我来说似乎毫无帮助。这种（通过 SSE 进行的）复制到处都很常见，所以每次都出现 16 行噪音很烦人。

（另外，不要问我这个古怪的函数签名是怎么回事，我也不知道编译器在做什么。）

memset 中使用的一条 PUNPCKLBW 指令（这里只是通过把低 8 个字节各重复一次来填充 XMM0，实际效果是把要设置的那个字节广播到 XMM0 的全部 16 个字节）被展开成了几十行天书般的代码（我相信它忠实地模拟了指令的效果，但这毫无帮助）：

反汇编：

                     **************************************************************
                     *                          FUNCTION                          *
                     **************************************************************
                     longlong * __fastcall memset(void * location, byte byteT
     longlong *        RAX:8          <RETURN>
     void *            RCX:8          location
     byte              DL:1           byteToSet                               XREF[1]:     1411960a8(W)  
     ulonglong         R8:8           count
     undefined8        R9:8           inputByteRepeated8                      XREF[1]:     1411960a0(W)  
     undefined2        DX:2           inputByteRepeated2                      XREF[1]:     1411960a8(W)  
                     memset                                          XREF[518]:  [...]
 141196090 4c 8b d9        MOV        R11,location
 141196093 0f b6 d2        MOVZX      EDX,DL
 141196096 49 b9 01        MOV        R9,0x101010101010101
           01 01 01 
           01 01 01 01
 1411960a0 4c 0f af ca     IMUL       R9,RDX
 1411960a4 49 83 f8 10     CMP        R8,0x10
 1411960a8 0f 86 f2        JBE        LAB_1411961a0
           00 00 00
 1411960ae 66 49 0f        MOVQ       XMM0,R9
           6e c1
 1411960b3 66 0f 60 c0     PUNPCKLBW  XMM0,XMM0
                      [...]

反编译：

longlong * memset(void *location,byte byteToSet,ulonglong count)
{
  // [...]
  ushort inputByteRepeated2;
  ulonglong inputByteRepeated8;
  undefined4 uVar5;
  undefined4 uVar7;
  undefined4 uVar8;
  undefined auVar6 [13];

  inputByteRepeated8 = (ulonglong)byteToSet * 0x101010101010101;
  inputByteRepeated2 = (ushort)inputByteRepeated8;
  _inputByteRepeated2 = (uint)inputByteRepeated8;
  if (count < 0x11) {
    // [...]
  }
  auVar6[6] = SUB141(ZEXT814(inputByteRepeated8) >> 0x30,0);
  auVar6 = ZEXT813(inputByteRepeated8);
  register0x0000120c =
       SUB164(CONCAT313(SUB163(CONCAT214(SUB162(CONCAT115(SUB161(ZEXT816(inputByteRepeated8) >> 0x38
                                                                 ,0),
                                                          CONCAT114(SUB151(ZEXT815(
                                                  inputByteRepeated8) >> 0x38,0),
                                                  ZEXT814(inputByteRepeated8))) >> 0x70,0),
                                         CONCAT113(auVar6[6],auVar6)) >> 0x68,0),
                        CONCAT112(auVar6[6],ZEXT812(inputByteRepeated8))) >> 0x60,0);
  _auVar6 = CONCAT79(SUB167(CONCAT610(SUB166(CONCAT511(SUB165(CONCAT412(register0x0000120c,
                                                                        CONCAT111(SUB131(auVar6 >> 
                                                  0x28,0),ZEXT811(inputByteRepeated8))) >> 0x58,0),
                                                  CONCAT110(SUB121(ZEXT812(inputByteRepeated8) >>
                                                                   0x28,0),
                                                            (unkuint10)inputByteRepeated8)) >> 0x50,
                                             0),
                                      CONCAT19(SUB131(auVar6 >> 0x20,0),(unkuint9)inputByteRepeated8
                                              )) >> 0x48,0),
                     (unkuint9)inputByteRepeated8 & 0xffffffffffffffff | 0 << 0x40);
  register0x00001208 = SUB168(_auVar6 >> 0x40,0);
  _auVar6 = CONCAT115(SUB1611(CONCAT106(SUB1610(CONCAT97(SUB169(CONCAT88(register0x00001208,
                                                                         (inputByteRepeated8 >> 0x18
                                                                         ) << 0x38) >> 0x38,0),
                                                         ((uint7)inputByteRepeated8 >> 0x18) << 0x30
                                                        ) >> 0x30,0),
                                        ((uint6)inputByteRepeated8 >> 0x10) << 0x28) >> 0x28,0),
                      ((uint5)inputByteRepeated8 >> 0x10) << 0x20);
  _auVar6 = CONCAT142(SUB1614(CONCAT133(SUB1613(CONCAT124(SUB1612(_auVar6 >> 0x20,0),
                                                          (_inputByteRepeated2 >> 8) << 0x18) >>
                                                0x18,0),((uint3)inputByteRepeated8 >> 8) << 0x10) >>
                              0x10,0),inputByteRepeated2 & 0xff | inputByteRepeated2 << 8);
  uVar7 = SUB164(_auVar6 >> 0x20,0);
  uVar5 = SUB164(_auVar6,0);
  uVar8 = SUB164(_auVar6 >> 0x40,0);

  // [...]

只要代码只是执行一条 MOVAPS xmmword ptr [location],XMM0 或类似的指令，其余的反编译结果也同样使用这些笨拙的、彼此独立的 4 字节块。结果我花了好一阵子才认出整个函数其实是 memset！

在这些情况下，如何使反编译的代码更具可读性（或首先避免混乱）？

（如果 SSE 标签已经存在，我会用它来标记这个问题；但我不愿意自己创建这个标签，因为我对这个社区还不够了解。当然，欢迎提出改进建议！）