-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPubTator_read.py
More file actions
90 lines (73 loc) · 3.14 KB
/
PubTator_read.py
File metadata and controls
90 lines (73 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import json
import re
import requests
from xml.dom.minidom import parseString
def extract_xml(str_xml):
# take str into domtree
dom_tree = parseString(str_xml)
collection = dom_tree.documentElement
# find node
annotation = collection.getElementsByTagName("annotation")
with open("gene_name.csv", "a") as xml_opt:
for tmp in annotation:
name_node = tmp.getElementsByTagName("text")[0]
xml_opt.write(name_node.childNodes[0].data + "\n")
def extract_json(str_json):
with open("gene_name.csv", "a") as json_opt:
# take json into python obj
data = json.loads(str_json)
passages = data['passages']
for obj in passages:
tmp = obj['annotations']
for information in tmp:
now_infons = information['infons']
if now_infons['type'] == 'Gene':
json_opt.write(information['text'] + "\n")
def extract_pubtator(str_pubtator):
pattern = re.compile(r'([\-|\w]+.)Gene')
result = pattern.findall(str_pubtator)
with open("gene_name.csv", "a") as tator_opt:
for my_str in result:
tator_opt.write(str(my_str).rstrip() + "\n")
def process_file(file_format, file_type, process_list):
for process_id in process_list:
url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/{}?{}={}&concepts=gene". \
format(file_format, file_type, process_id)
file = requests.get(url).text
if file_format == "pubtator":
extract_pubtator(file)
with open("{}_{}.tutor".format(file_format, process_id), "w") as f:
f.write(file)
elif file_format == "biocxml":
extract_xml(file)
with open("{}_{}.xml".format(file_format, process_id), "w") as f:
f.write(file)
elif file_format == "biocjson":
extract_json(file)
with open("{}_{}.json".format(file_format, process_id), "w") as f:
f.write(file)
print("extract all gene in documents!!!")
def process_string():
my_str = input("Please enter the article numbers you want to retrieve, separated by commas: ")
return my_str.split(",")
if __name__ == "__main__":
# Input the format
file_format = input("Input your need file format! 'tator' or 'xml' or 'json': ")
if file_format != 'tator' and file_format != 'xml' and file_format != 'json':
raise ValueError("Input error, please your input!")
format_convert = {
"tator": "pubtator",
"xml": "biocxml",
"json": "biocjson",
}
# Input the type
tator_type = input("Input the type! 'pmids' or 'pmcids': ")
if tator_type != 'pmids' and tator_type != 'pmcids':
raise ValueError("Input error, please your input!")
# Input the string to storage list
my_list = process_string()
assert type(my_list) == type([])
if file_format == 'tator' and tator_type == 'pmcids':
raise ValueError("pmcids can only be used to retrieve publications in biocxml or biocjson formats!!!")
# Start processing the file
process_file(format_convert[file_format], tator_type, my_list)