-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocessors.py
More file actions
161 lines (120 loc) · 5.68 KB
/
processors.py
File metadata and controls
161 lines (120 loc) · 5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#-*- coding: utf-8 -*-
# from __future__ import print_function
# import re, sys, html, os, HTMLParser #Python2 (server)
from html.parser import HTMLParser #Python3 (request)
import sys, re, os, html
sys.path.append(".")
sys.path.append("..")
sys.path.append("...")
sys.path.append('path_for_nltk')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from string import punctuation
def handleUnderScore(token_list):
    """Tokenize the input and expand tokens that contain underscores.

    The argument is handed straight to ``getTokens``, so despite its name it
    must be whatever ``getTokens`` accepts.

    For every token the ``'_'``-separated pieces are emitted first; when the
    token actually contained an underscore (i.e. it is not one of its own
    pieces) the original token is emitted after them as well.

    Returns:
        list: pieces and, where applicable, the original underscored tokens,
        in input order.
    """
    expanded = []
    for token in getTokens(token_list):
        pieces = token.split('_')
        expanded += pieces
        # A token without '_' splits to [token]; only keep the original
        # alongside its pieces when a real split happened.
        if token not in pieces:
            expanded.append(token)
    return expanded
def getTokens(re):
    """Split text into tokens made of consecutive non-whitespace characters.

    Args:
        re: the text to tokenize. NOTE: this parameter name shadows the ``re``
            module inside this function only; it is kept unchanged so that
            existing callers are not affected.

    Returns:
        The tokens produced by ``RegexpTokenizer(r'\\S+').tokenize``.
    """
    # Pattern r'\S+' keeps punctuation attached to words; special characters
    # are handled separately by other helpers.
    return RegexpTokenizer(r'\S+').tokenize(re)
def camel_case_split(identifier):
    """Split an identifier at its camel-case word boundaries.

    A boundary is recognised
      * between a lowercase letter and a following uppercase letter
        (``fooBar`` -> ``foo`` | ``Bar``), and
      * between an uppercase run and a following capitalised word
        (``HTMLParser`` -> ``HTML`` | ``Parser``).

    Args:
        identifier (str): the identifier to split.

    Returns:
        list[str]: the pieces in order. An identifier without boundaries is
        returned as a single-element list; an empty string yields ``[]``.
    """
    # The lowercase classes must span the whole a-z range; a narrower class
    # silently misses boundaries after letters outside it (e.g. the 'o' in
    # "fooBar").
    pattern = r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return [match.group(0) for match in re.finditer(pattern, identifier)]
def removeEndingPunct(token_list):
    """Strip punctuation from both ends of every token.

    Args:
        token_list: iterable of string tokens.

    Returns:
        str: the ``str()`` representation of the list of stripped tokens
        (e.g. ``"['a', 'b']"``), not the list itself.
        NOTE(review): returning the stringified list looks unusual -- confirm
        that callers rely on it before changing.
    """
    trimmed = []
    for token in token_list:
        trimmed.append(token.strip(punctuation))
    return str(trimmed)
def charLength(x, l=3):
    """Tell whether ``x`` is purely alphabetic and at least ``l`` characters long.

    Args:
        x (str): the token to check.
        l (int): minimum length required (default 3).

    Returns:
        bool: True only when both conditions hold.
    """
    if not x.isalpha():
        return False
    return len(x) >= l
def preSpecialCharRemove(text):
    """Replace every special character in ``text`` with a single space.

    The characters replaced are:
    ``| = / ( ) [ ] , . ' ` ! @ # $ % ^ & * + - < > ? : ; { } " \\ ~``
    (the underscore is intentionally left untouched by this list).

    Args:
        text (str): input text.

    Returns:
        str: text of the same length with each special character turned
        into ``' '``.
    """
    specials = "|=/()[],.'`!@#$%^&*+-<>?:;{}\"\\~"
    # One translation pass maps each special character to a space.
    to_space = str.maketrans(specials, " " * len(specials))
    return text.translate(to_space)
def handleCamelCase(tokens):
    """Split every token at its camel-case boundaries and flatten the result.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the pieces returned by ``camel_case_split`` for each token,
        concatenated in input order.
    """
    return [piece for token in tokens for piece in camel_case_split(token)]
def stem(tokens):
    """Apply NLTK's ``PorterStemmer`` to every token.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the stemmed tokens, in input order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))
def preStopRemoval(tokens):
    """Drop tokens that appear in either stop-word collection.

    A token is kept only when it is in neither ``en_stop`` nor
    ``my_stop_words``.

    NOTE(review): ``en_stop`` and ``my_stop_words`` are module-level names
    that are not defined in the visible part of this file -- confirm they are
    provided before this function is called, otherwise it raises NameError.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the surviving tokens, in input order.
    """
    return [
        token
        for token in tokens
        if token not in en_stop and token not in my_stop_words
    ]
def lower_(tokens):
    """Return a list with the lowercase form of every token.

    Args:
        tokens: iterable of strings. Passing a plain string iterates over its
            characters.

    Returns:
        list: ``token.lower()`` for each token, in input order.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
def preBasic(text):
    """Convert HTML character references in ``text`` back to plain characters.

    Uses the Python 3 ``html.unescape`` (e.g. ``&gt;`` becomes ``>``).

    Args:
        text (str): raw text that may contain HTML entities.

    Returns:
        str: the unescaped text.
    """
    return html.unescape(text)
def SPC(text):
    """[SPC] Split by special characters.

    Delegates to ``preSpecialCharRemove`` and returns its result unchanged.

    Args:
        text (str): input text.

    Returns:
        The text returned by ``preSpecialCharRemove``.
    """
    cleaned = preSpecialCharRemove(text)
    return cleaned
def CMC(tokens):
    """[CMC] Split tokens by camel case, then lowercase the pieces.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: lowercase camel-case pieces, in input order.
    """
    return lower_(handleCamelCase(tokens))
def STM(tokens):
    """[STM] Lowercase the tokens, then stem them.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the result of ``stem`` applied to the lowercased tokens.
    """
    return stem(lower_(tokens))
def SWR(tokens):
    """[SWR] Lowercase the tokens, then remove stop words.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the result of ``preStopRemoval`` applied to the lowercased
        tokens.
    """
    return preStopRemoval(lower_(tokens))
def postBasic(tokens):
    """Drop all-digit tokens and custom stop words, then join with spaces.

    NOTE(review): ``my_stop_words`` is a module-level name that is not
    defined in the visible part of this file -- confirm it is provided before
    this function is called.

    Args:
        tokens: iterable of string tokens.

    Returns:
        str: the remaining tokens joined by a single space.
    """
    kept = [
        token
        for token in tokens
        if not token.isdigit() and token not in my_stop_words
    ]
    return " ".join(kept)
if __name__ == "__main__":
    # Manual smoke test: run each processing kind on a sample bug report and
    # print the result.
    text = """2014-12-29 22:25:12,667 | ERROR | FelixStartLevel | BlueprintCamelContext | 8 - org.apache.camel.camel-blueprint - 2.14.0 | Error occurred during starting Camel: CamelContext(elasticSearchProducerCamelContext) due Failed to create route log-event-sink-elasticsearch at: &gt;&gt;&gt; Aggregate[true -&gt; [To[log:xxx?level=INFO&amp;groupInterval=2000], To[elasticsearch://elasticsearch?ip=127.0.0.1&amp;port=9300]]] &lt;&lt;&lt; in route: Route(log-event-sink-elasticsearch)[[From[vm:log-event-elast... because of Failed to resolve endpoint: elasticsearch://elasticsearch?ip=127.0.0.1&amp;port=9300 due to: Failed to resolve config path [names.txt], tried file path [names.txt], path file [xxx/config/names.txt], and classpath
org.apache.camel.FailedToCreateRouteException: Failed to create route log-event-sink-elasticsearch at: &gt;&gt;&gt; Aggregate[true -&gt; [To[log:xxx?level=INFO&amp;groupInterval=2000], To[elasticsearch://elasticsearch?ip=127.0.0.1&amp;port=9300]]] &lt;&lt;&lt; in route: Route(log-event-sink-elasticsearch)[[From[vm:log-event-elast... because of Failed to resolve endpoint: elasticsearch://elasticsearch?ip=127.0.0.1&amp;port=9300 due to: Failed to resolve config path [names.txt], tried file path [names.txt], path file [xxx/config/names.txt], and classpath
at org.apache.camel.model.RouteDefinition.addRoutes(RouteDefinition.java:945)
at org.apache.camel.model.RouteDefinition.addRoutes(RouteDefinition.java:187)
...
Caused by: org.elasticsearch.env.FailedToResolveConfigException: Failed to resolve config path [names.txt], tried file path [names.txt], path file [xxx/config/names.txt], and classpath
at org.elasticsearch.env.Environment.resolveConfig(Environment.java:213)
at org.elasticsearch.node.internal.InternalSettingsPreparer.prepareSettings(InternalSettingsPreparer.java:119)
...
This can be fixed by adding tmp one-liner to explicitly set the ClassLoader on the elasticSearch Settings class to the classloader of Settings.class:
Settings settings = ImmutableSettings.settingsBuilder()
...
.classLoader(Settings.class.getClassLoader());
..."""

    # Kinds
    #   [B]   baseline
    #   [SPC] split by special char
    #   [CMC] split by camel case
    #   [STM] stemming
    #   [SWR] stop words removal

    # SPC works on the raw text; CMC, STM and SWR iterate over tokens, so the
    # text is tokenized once here. Passing the raw string to them would make
    # them process it one character at a time.
    tokens = getTokens(text)

    print(SPC(text))
    print(CMC(tokens))
    print(STM(tokens))
    # NOTE(review): SWR relies on the module-level names en_stop and
    # my_stop_words, which are not defined in the visible part of this file --
    # confirm they exist, otherwise this call raises NameError.
    print(SWR(tokens))