-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_idx_db.py
More file actions
38 lines (33 loc) · 1.48 KB
/
create_idx_db.py
File metadata and controls
38 lines (33 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Generate the list of index files archived in EDGAR since start_year (earliest: 1993) until the most recent quarter
import datetime
#current_year = datetime.date.today().year
current_year = 2010
current_quarter = (datetime.date.today().month - 1) // 3 + 1
start_year = 1996
years = list(range(start_year, current_year))
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
history = [(y, q) for y in years for q in quarters]
for i in range(1, current_quarter + 1):
history.append((current_year, 'QTR%d' % i))
urls = ['https://www.sec.gov/Archives/edgar/full-index/%d/%s/crawler.idx' % (x[0], x[1]) for x in history]
urls.sort()
# Download index files and write content into SQLite
import sqlite3
import requests
con = sqlite3.connect('edgar_htm_idx.db')
cur = con.cursor()
cur.execute('DROP TABLE IF EXISTS idx')
cur.execute('CREATE TABLE idx (conm TEXT, type TEXT, cik TEXT, date TEXT, path TEXT)')
for url in urls:
lines = requests.get(url).text.splitlines()
nameloc = lines[7].find('Company Name')
typeloc = lines[7].find('Form Type')
cikloc = lines[7].find('CIK')
dateloc = lines[7].find('Date Filed')
urlloc = lines[7].find('URL')
records = [tuple([line[:typeloc].strip(), line[typeloc:cikloc].strip(), line[cikloc:dateloc].strip(),
line[dateloc:urlloc].strip(), line[urlloc:].strip()]) for line in lines[9:]]
cur.executemany('INSERT INTO idx VALUES (?, ?, ?, ?, ?)', records)
print(url, 'downloaded and wrote to SQLite')
con.commit()
con.close()