-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeature_extractor.py
More file actions
111 lines (79 loc) · 3.45 KB
/
feature_extractor.py
File metadata and controls
111 lines (79 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
'''
This module contains feature extractor for binary files.
It uses floss to extract the strings from the binary file and then it uses the strings dataset to create features vector.
The features are the if binary contains or not a string from the strings dataset (which created by looking at frequent strings in malwares).
'''
import json
import os
import subprocess
import sys
from strings_extractor import get_strings_from_binary, get_strings_from_output
def load_strings_dataset(strings_dataset_path):
'''
This function returns the dataset from the given strings_dataset file.
Args:
strings_dataset_path (str): The path to the strings_dataset file.
Returns:
dictionary: A dictionary of { string : {"suspicious_score": float, "frequency_score": float} }.
'''
with open(strings_dataset_path, 'r') as f:
return json.load(f)['strings']
def exctract_features_from_binary(binary_path, strings_dataset_path):
'''
This function returns the features vector of the given binary file.
Args:
binary_path (str): The path to the binary file.
strings_dataset_path (str): The path to the strings_dataset file.
Returns:
list: A list of features.
'''
strings = get_strings_from_binary(binary_path)
strings_dataset = load_strings_dataset(strings_dataset_path)
return get_features_vector(strings, strings_dataset)
def extract_features_from_output(output_path, strings_dataset_path):
'''
This function returns the features vector of the given output file.
Args:
output_path (str): The path to the output file.
strings_dataset_path (str): The path to the strings_dataset file.
Returns:
list: A list of features.
'''
strings = get_strings_from_output(output_path)
strings_dataset = load_strings_dataset(strings_dataset_path)
return get_features_vector(strings, strings_dataset)
def get_features_vector(strings, strings_dataset):
'''
This function returns the features vector of the given strings and strings_dataset.
Args:
strings (list): A list of DataString objects.
strings_dataset (dictionary): A dictionary of { string : {"suspicious_score": float, "frequency_score": float} }.
Returns:
list: A list of features.
'''
strings_dataset_list = list(strings_dataset.keys())
strings_features = [0] * len(strings_dataset_list)
# strings features - for each string in input file, check if it is in the dataset (so it suspicious)
for string in strings:
if string.string in strings_dataset_list:
strings_features[strings_dataset_list.index(string.string)] = 1
# type features - if the file contains decoded string, stack string or tight string
type_features = [0] * 3
for string in strings:
if string.type_of_extraction == 'decoded_string':
type_features[0] = 1
elif string.type_of_extraction == 'stack_string':
type_features[1] = 1
elif string.type_of_extraction == 'tight_string':
type_features[2] = 1
return strings_features + type_features
def get_num_of_features(strings_dataset_path):
'''
This function returns the number of features for each input file.
Args:
strings_dataset_path (str): The path to the strings_dataset file.
Returns:
int: The number of features.
'''
strings_dataset = load_strings_dataset(strings_dataset_path)
return len(strings_dataset) + 3