File size: 4,632 Bytes
cbfae43
7dfd5db
6c6ef93
9c9d121
7dfd5db
2144fdb
3b930a2
 
 
 
771a832
2144fdb
f37ed6f
e62c0db
83b6b2f
e62c0db
 
7a1d957
 
 
 
 
 
 
 
 
 
 
 
 
 
5767a00
6c6ef93
e62c0db
4f80c95
3b930a2
93bd79b
 
 
 
 
 
 
 
 
 
 
 
 
83b6b2f
9c9d121
fc9ddd0
84443f7
87253bd
 
 
 
cac8a90
a39d8f4
7a65a6f
9fac792
 
fb01acd
9fac792
 
 
 
fb01acd
fc46b6a
9fac792
a39d8f4
822d8b4
33dfc83
 
9f58c2e
 
fb01acd
c92251d
6ce6744
 
 
 
 
 
f900015
 
33dfc83
83b6b2f
6c6ef93
83b6b2f
 
6c6ef93
c92251d
83b6b2f
6c6ef93
83b6b2f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import frontmatter
import gradio as gr
import json
import spaces

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hub repository and the GGUF file inside it that holds the model weights.
model_id = "AverageBusinessUser/aidapal"
filename = "aidapal-8k.Q4_K_M.gguf"

print("Downloading model")

# Both the tokenizer and the model are read from the same GGUF file.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(
    model_id, gguf_file=filename, device_map="auto"
)

# Sample Hex-Rays pseudocode. Not referenced by the rest of the visible code;
# kept as a handy input for manual testing.
example = """int __fastcall sub_B0D04(int a1, int a2)
{
  unsigned int v2; // r4
  int result; // r0

  v2 = a1 + a2;
  if ( __CFADD__(a1, a2) )
    return 0;
  result = _libc_alloca_cutoff();
  if ( v2 <= 0x1000 )
    return result | 1;
  return result;
}"""

# The dataset is read one JSON object per line; each object's "input" field
# becomes a clickable example in the UI. The file is opened in a `with` block
# so the handle is closed deterministically, and blank lines (e.g. a trailing
# newline at end of file) are skipped instead of aborting startup.
with open("gpt4_juiced_dataset.json", "r", encoding="utf-8") as dataset_file:
    examples = [
        json.loads(line)["input"] for line in dataset_file if line.strip()
    ]

# Text-generation pipeline built from the model and tokenizer loaded above.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# TEMPLATE """{{ .System }}
# [INST]
# {{ .Prompt }}
# [/INST]
# """
# SYSTEM """<s>[INST]You are an expert at analyzing code that has been decompiled with IDA Hex Rays into IDA Hex Rays pseudocode. As a IDA Hex Rays pseudocode analyzer, you will be provided code that may or may not have symbols and variable names. You will analyze the IDA Hex Rays pseudocode and explain exactly what each line is doing. Then you will review your analysis and determine potential name for the function and variables within the function. Your task is use your knowledge of reverse engineering, IDA Hex Rays pseudocode, and C to assist the user with analysis and reverse engineering. Provide a detailed description of the Hex Rays pseudocode to the user explaining what the code does, suggest a function name based on the analysis of the pseudocode, and new variable names based on the analysis of the code. Only respond with valid JSON using the keys 'function_name','comment', and an array 'variables'. Values should use plain ascii with no special characters.
# Analyze the following IDA Hex Rays pseudocode and generate a valid JSON object containing the keys 'function_name','comment', and an array 'variables' explaining what the code does, suggest a function name based on the analysis of the code, and new variable names based on the analysis of the code.[/INST]</s>
# """

# System prompt placed at the start of every prompt built in predict().
# It repeats the SYSTEM text kept in the commented-out template above.
# NOTE(review): this is runtime text sent to the model; its exact wording
# (including quirks such as "As a IDA" and "Your task is use") presumably
# matches what the model was tuned with -- confirm before editing it.
system = """<s>[INST]You are an expert at analyzing code that has been decompiled with IDA Hex Rays into IDA Hex Rays pseudocode. As a IDA Hex Rays pseudocode analyzer, you will be provided code that may or may not have symbols and variable names. You will analyze the IDA Hex Rays pseudocode and explain exactly what each line is doing. Then you will review your analysis and determine potential name for the function and variables within the function. Your task is use your knowledge of reverse engineering, IDA Hex Rays pseudocode, and C to assist the user with analysis and reverse engineering. Provide a detailed description of the Hex Rays pseudocode to the user explaining what the code does, suggest a function name based on the analysis of the pseudocode, and new variable names based on the analysis of the code. Only respond with valid JSON using the keys 'function_name','comment', and an array 'variables'. Values should use plain ascii with no special characters.
Analyze the following IDA Hex Rays pseudocode and generate a valid JSON object containing the keys 'function_name','comment', and an array 'variables' explaining what the code does, suggest a function name based on the analysis of the code, and new variable names based on the analysis of the code.[/INST]</s>
"""


# Markdown code fence that may surround the JSON answer in the model output.
_JSON_FENCE_OPEN = "```json\n"
_JSON_FENCE_CLOSE = "```"


def _extract_json(raw_output):
    """Return the JSON text found in *raw_output*, or ``"[]"`` if none parses.

    Surrounding whitespace is ignored. If the text starts with a
    "```json" fence, the opening fence and a matching trailing "```" are
    both removed; leaving the trailing fence in place would make every
    fenced answer fail to parse.
    """
    candidate = raw_output.strip()
    if candidate.startswith(_JSON_FENCE_OPEN):
        candidate = candidate[len(_JSON_FENCE_OPEN):]
        if candidate.endswith(_JSON_FENCE_CLOSE):
            candidate = candidate[: -len(_JSON_FENCE_CLOSE)]
        candidate = candidate.strip()

    try:
        # Parsed only to validate; the caller wants the JSON as text.
        json.loads(candidate)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the output is not valid JSON,
        # so fall back to an empty JSON array as before.
        return json.dumps([])
    return candidate


@spaces.GPU
def predict(code):
    """Ask the model to analyze Hex-Rays pseudocode.

    Args:
        code: Decompiler output to analyze, inserted verbatim into the prompt.

    Returns:
        A ``(json_output, raw_output)`` tuple. ``json_output`` is the model's
        answer as a JSON string (with any Markdown fence removed), or
        ``"[]"`` when the answer is not valid JSON. ``raw_output`` is the
        unmodified generated text.
    """
    prompt = f"""{system}
[INST]
{code}
[/INST]
"""

    print(f"Prompt: {repr(prompt)}")
    print(f"Tokenized: {tokenizer.tokenize(prompt)}")
    # NOTE(review): max_length bounds prompt tokens plus generated tokens
    # together, so long inputs leave less room for the answer. Consider
    # max_new_tokens if that is not intended -- confirm against the model's
    # context size before changing.
    pipe_out = pipe(
        prompt,
        do_sample=True,
        top_k=100,
        top_p=0.09,
        temperature=1.2,
        repetition_penalty=1.1,
        return_full_text=False,
        max_length=4096,
    )
    print(f"Pipe out: {repr(pipe_out)}")

    raw_output = pipe_out[0]["generated_text"]
    json_output = _extract_json(raw_output)

    print(f"JSON output: {repr(json_output)}")

    return json_output, raw_output


# Input widget: a single text box for the decompiler output.
code_input = gr.Text(label="Hex-Rays decompiler output")

# Output widgets, in the order predict() returns its values.
result_outputs = [
    gr.JSON(label="Aidapal Output as JSON"),
    gr.Text(label="Raw Aidapal Output"),
]

# The UI description is the README body as parsed by frontmatter.
readme_body = frontmatter.load("README.md").content

demo = gr.Interface(
    fn=predict,
    inputs=code_input,
    outputs=result_outputs,
    description=readme_body,
    examples=examples,
)
demo.launch()