我有一个将字符串(具有某种结构)作为输入的程序。我的目标是收集有关输入中每个字符在使用concolic execution解析时与哪些值进行比较的信息。正确的做法是什么?
我的主题是一个用于表达式的简单递归下降解析器,它接受表单1+(2*3)
等的输入。
到目前为止我尝试过的:
编写了一个使用字符串参数构造初始状态的程序,通过一次一个步骤执行它,最后收集约束。
import sys
import angr
class Program:
def __init__(self, exe):
self.exe = exe
self.project = angr.Project(exe, load_options={'auto_load_libs': False})
def set_input(self, arg):
self.arg1 = arg
self.initial_state = self.project.factory.entry_state(
args=[self.exe, self.arg1],
# does not seem to startup the unicorn engine
add_options=angr.options.unicorn,
)
def run(self):
state = self.initial_state
while True:
succ = state.step()
if len(succ.successors) > 1:
raise Exception('more successors %d' % len(succ.successors))
if not succ.successors: return state
state, = succ.successors
def main(exe, arg):
prog = Program(exe)
prog.set_input(arg)
res = prog.run()
print("constraints: %d" % len(res.solver.constraints))
for i in res.solver.constraints:
print(i)
print('done')
if __name__ == '__main__':
assert len(sys.argv) >= 3
main(sys.argv[1], sys.argv[2])
当我执行这个时,我得到零约束。
使用受约束的符号参数
我的想法是我应该使参数具有符号性,并使用 state.add_constraints 为输入中的每个字符添加约束,并查找最后产生的约束。
import sys
import angr
import claripy
class Program:
ARG_PREFIX = 'sym_arg'
def __init__(self, exe):
self.exe = exe
self.project = angr.Project(exe, load_options={'auto_load_libs': False})
def set_input(self, arg):
# generate arg1 from individual characters.
self.arg1 = self.make_symbolic_char_args(arg)
self.initial_state = self.project.factory.entry_state(
args=[self.exe, self.arg1],
add_options=angr.options.unicorn,
# does not seem to affect the number of constraints created
# remove_options=angr.options.simplification
)
self.constrain_input_chars(self.initial_state, self.arg1a, arg)
self.string_terminate(self.initial_state, self.arg1a, arg)
def string_terminate(self, state, symarg, inarg):
state.add_constraints(symarg[len(inarg)] == 0)
def constrain_input_chars(self, state, symarg, sarg):
constraints = []
for i,a in enumerate(sarg):
state.add_constraints(symarg[i] == a)
def run(self):
state = self.initial_state
while True:
succ = state.step()
if len(succ.successors) > 1:
raise Exception('more successors %d' % len(succ.successors))
if not succ.successors: return state
state, = succ.successors
def make_symbolic_char_args(self, instr, symbolic=True):
if not symbolic: return instr
input_len = len(instr)
largs = range(0, input_len+1)
arg1k = ['%s_%d' % (Program.ARG_PREFIX, i) for i in largs]
self.arg1h = {k:claripy.BVS(k, 8) for k in arg1k}
self.arg1a = [self.arg1h[k] for k in arg1k]
return reduce(lambda x,y: x.concat(y), self.arg1a)
def main(exe, arg):
prog = Program(exe)
prog.set_input(arg)
res = prog.run()
print("constraints: %d" % len(res.solver.constraints))
for i in res.solver.constraints:
print(i)
print('done')
if __name__ == '__main__':
assert len(sys.argv) >= 3
main(sys.argv[1], sys.argv[2])
虽然这最终会导致大量约束,但我不知道如何将这些约束解释为我想要的——与输入字符的比较。约束相当复杂,但每个项目中有很多重复的部分。将如下所示的这些约束转换为实际字符比较的任何建议将不胜感激。
...
<Bool (((if (sym_arg_0_0_8 == 0) then 0x0 else (if (sym_arg_1_1_8 == 0) then 0x1 else (if (sym_arg_2_2_8 == 0) then 0x2 else (if (sym_arg_3_3_8 == 0) then 0x3 else 0x4)))) + 0x7fffffffffeffd4) - 0x7fffffffffeffd4) == strlen_5_64>
<Bool 0x1 <= strlen_5_64>
<Bool ((0#56 .. 0#7 .. __invert__((if ((sym_arg_0_0_8 == 40) && (sym_arg_0_0_8[7:7] == 0)) then 1 else 0))) & 0xff) != 0x0>
<Bool (((if (sym_arg_0_0_8 == 0) then 0x0 else (if (sym_arg_1_1_8 == 0) then 0x1 else (if (sym_arg_2_2_8 == 0) then 0x2 else (if (sym_arg_3_3_8 == 0) then 0x3 else 0x4)))) + 0x7fffffffffeffd4) - 0x7fffffffffeffd4) == strlen_6_64>
<Bool 0x1 <= strlen_6_64>
<Bool ((0#56 .. 0#7 .. (if ((sym_arg_0_0_8 == 49) && (sym_arg_0_0_8[7:7] == 0)) then 1 else 0)) & 0xff) != 0x0>
<Bool (((if (sym_arg_0_0_8 == 0) then 0x0 else (if (sym_arg_1_1_8 == 0) then 0x1 else (if (sym_arg_2_2_8 == 0) then 0x2 else (if (sym_arg_3_3_8 == 0) then 0x3 else 0x4)))) + 0x7fffffffffeffd4) - 0x7fffffffffeffd4) == strlen_7_64>
<Bool 0x2 <= strlen_7_64>
<Bool ((0#56 .. 0#7 .. (if ((sym_arg_1_1_8 == 49) && (sym_arg_1_1_8[7:7] == 0)) then 1 else 0)) & 0xff) == 0x0>
<Bool (((if (sym_arg_0_0_8 == 0) then 0x0 else (if (sym_arg_1_1_8 == 0) then 0x1 else (if (sym_arg_2_2_8 == 0) then 0x2 else (if (sym_arg_3_3_8 == 0) then 0x3 else 0x4)))) + 0x7fffffffffeffd4) - 0x7fffffffffeffd4) == strlen_8_64>
<Bool 0x2 <= strlen_8_64>
...
问题:太慢
即使输入几个字符长,上述方法也需要很长时间才能执行。我尝试在简单的递归下降解析器上对输入几个字符的表达式执行上述操作:( 1+4/1+4/(1+4)
)。解析花了30多分钟!,而直接运行解析器不到一秒就出来了。如果没有其他出路,我会接受那个时间,但这是concolic,我无法摆脱必须有更好方法的感觉。
使用不受约束的符号参数,但探索我自己的
这个想法是我从输入约束开始,使得每个符号字符等于相应的输入。然后我提供我自己的执行路径,以便只选择满足输入约束的路径进行进一步执行。我的想法是,由于我没有在初始状态下开始原始字符约束,我可以检查最终状态以确定在每个字符上累积的其他约束。
import claripy
import sys
import angr
class Program:
ARG_PREFIX = 'sym_arg'
def __init__(self, exe):
self.exe = exe
self.project = angr.Project(exe, load_options={'auto_load_libs': False})
def set_input(self, arg):
# generate arg1 from individual characters.
self.arg1 = self.make_symbolic_char_args(arg)
self._constraints = []
self.constrain_input_chars(self.arg1a, arg)
self.string_terminate(self.arg1a, arg)
self.constraints = claripy.And(*self._constraints)
self.initial_state = self.project.factory.entry_state(
args=[self.exe, self.arg1],
add_options=angr.options.unicorn,
)
def string_terminate(self, symarg, inarg):
self._constraints.append(symarg[len(inarg)] == 0)
def constrain_input_chars(self, symarg, sarg):
for i,a in enumerate(sarg):
self._constraints.append(symarg[i] == a)
def get_satisfying_state(self, state):
succ = state.step()
if not succ.successors: return None
result = [i.satisfiable(extra_constraints=(self.constraints,)) for i in succ.successors]
assert len(result) == 1
return result[0]
def run(self):
state = self.initial_state
while True:
newstate = self.get_satisfying_state(state)
if not newstate: return state
def make_symbolic_char_args(self, instr, symbolic=True):
if not symbolic: return instr
input_len = len(instr)
largs = range(0, input_len+1)
arg1k = ['%s_%d' % (Program.ARG_PREFIX, i) for i in largs]
self.arg1h = {k:claripy.BVS(k, 8) for k in arg1k}
self.arg1a = [self.arg1h[k] for k in arg1k]
return reduce(lambda x,y: x.concat(y), self.arg1a)
def main(exe, arg):
prog = Program(exe)
prog.set_input(arg)
res = prog.run()
print("constraints: %d" % len(res.solver.constraints))
for i in res.solver.constraints:
print(i)
print('done')
if __name__ == '__main__':
assert len(sys.argv) >= 3
main(sys.argv[1], sys.argv[2])
这再次不起作用,因为即使解析几个字符也需要很长时间。
我考虑过的其他选择- 编写一个内存挂钩,监控何时读取输入字符串内存。但这对我不起作用,因为在比较之前可能会复制字符。
我现在已经多次阅读angr 文档,但不知道该怎么做。我不能动摇它应该更简单的感觉,因为angr是一个支持concolic 执行的符号执行引擎,而我正在做的几乎是普通的vanila concolic 执行。我接下来可以做什么?
任何帮助将不胜感激!。