-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
127 lines (116 loc) · 4.57 KB
/
utils.py
File metadata and controls
127 lines (116 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import re
from io import StringIO
import tokenize
def remove_comments_and_docstrings(source, lang):
    """Return `source` minus comments (and, for Python, docstrings).

    :param source: program text to clean.
    :param lang: language name. 'python' uses the std tokenizer; 'ruby'
        is returned unchanged (no stripper implemented); anything else is
        treated as a C-style language (``//`` and ``/* */`` comments).
    :return: the cleaned source with blank lines removed.
    """
    if lang in ['python']:
        io_obj = StringIO(source)
        out = ""
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(io_obj.readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            # Re-create the horizontal spacing between consecutive tokens.
            if start_line > last_lineno:
                last_col = 0
            if start_col > last_col:
                out += (" " * (start_col - last_col))
            if token_type == tokenize.COMMENT:
                # Drop comments entirely.
                pass
            elif token_type == tokenize.STRING:
                # A string right after an INDENT, or after a NEWLINE at
                # column 0, is (very likely) a docstring and is dropped;
                # every other string literal is kept.
                if prev_toktype != tokenize.INDENT:
                    if prev_toktype != tokenize.NEWLINE:
                        if start_col > 0:
                            out += token_string
            else:
                out += token_string
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        # Drop lines left empty by the removals.
        return '\n'.join(line for line in out.split('\n') if line.strip() != "")
    elif lang in ['ruby']:
        # No comment stripper for Ruby here; return the source unchanged.
        return source
    else:
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " "  # note: a space and not an empty string
            else:
                return s
        # Matches // and /* */ comments plus string literals; strings are
        # matched only so `replacer` can pass them through untouched.
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        stripped = re.sub(pattern, replacer, source)
        return '\n'.join(line for line in stripped.split('\n') if line.strip() != "")
def tree_to_token_index(root_node):
    """Return (start_point, end_point) spans of every leaf token in the AST.

    A `string` node counts as a single token even when it has children;
    `comment` nodes are dropped entirely.
    """
    is_token = (len(root_node.children) == 0 or root_node.type == 'string')
    if is_token and root_node.type != 'comment':
        return [(root_node.start_point, root_node.end_point)]
    spans = []
    for child in root_node.children:
        spans.extend(tree_to_token_index(child))
    return spans
def tree_to_token_index_pro(root_node):
    """Walk the AST and return the leaf token nodes plus their spans.

    2022-06-20 yangkang: extends the original `tree_to_token_index` with a
    `mapping` list that stores the leaf token nodes themselves, so original
    code tokens can later be mapped onto AST leaves when adding DFG edges to
    the AST adjacency matrix (DFG nodes are leaves only, while the AST
    adjacency matrix also contains interior nodes).

    :param root_node: root node of the AST.
    :return: ``(mapping, code_tokens)`` — ``mapping`` is the list of leaf
        nodes (no interior nodes) in token order; ``code_tokens`` is the
        matching list of ``(start_point, end_point)`` spans, e.g.
        ``[((0, 0), (0, 3)), ((0, 4), (0, 10)), ...]``.
    """
    mapping = []

    def _walk(node):
        # Same leaf test as tree_to_token_index: childless nodes and whole
        # `string` nodes are tokens; comments are skipped.
        if (len(node.children) == 0 or node.type == 'string') and node.type != 'comment':
            mapping.append(node)
            return [(node.start_point, node.end_point)]
        spans = []
        for child in node.children:
            spans.extend(_walk(child))
        return spans

    code_tokens = _walk(root_node)
    return mapping, code_tokens
def tree_to_variable_index(root_node, index_to_code):
    """Return spans of leaf tokens whose text differs from their node type.

    :param root_node: AST node to walk.
    :param index_to_code: dict mapping a (start_point, end_point) span to a
        ``(idx, token_text)`` pair.
    :return: list of spans for variable-like leaves; leaves whose type
        equals their text (presumably keywords/punctuation — verify against
        the grammar) are excluded.
    """
    is_token = (len(root_node.children) == 0 or root_node.type == 'string')
    if is_token and root_node.type != 'comment':
        span = (root_node.start_point, root_node.end_point)
        _, token_text = index_to_code[span]
        return [span] if root_node.type != token_text else []
    spans = []
    for child in root_node.children:
        spans.extend(tree_to_variable_index(child, index_to_code))
    return spans
def index_to_code_token(index, code):
    """Extract the source text covered by a span from line-split code.

    :param index: ``((start_row, start_col), (end_row, end_col))`` span.
    :param code: the source split into lines (sequence of strings).
    :return: the text spanned by `index`, including embedded line breaks
        only if the lines in `code` themselves contain them.
    """
    (start_row, start_col), (end_row, end_col) = index
    if start_row == end_row:
        # Single-line token: a plain slice of that line.
        return code[start_row][start_col:end_col]
    # Multi-line token: tail of the first line, any whole middle lines,
    # then the head of the last line.
    parts = [code[start_row][start_col:]]
    for row in range(start_row + 1, end_row):
        parts.append(code[row])
    parts.append(code[end_row][:end_col])
    return "".join(parts)