pycode/scrap_bs.py at master · andjelx/pycode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import unicodecsv as csv
import re

# Landing page of the site; it is the page handed to getAreasList() below.
INITIAL_URL = 'http://www.creprice.cn/'

# Common prefix of the per-area ranking pages. A complete page address
# looks like:
#   http://www.creprice.cn/market/distrank/city/zy.html?flag=1
DATAURL = 'http://www.creprice.cn/market/distrank/city/'

# Placeholder for the area table; rebound further down the script.
areas = {}

def getAreasList(url):
    """Download *url* and collect every ``<span>`` that has a ``code`` attribute.

    Returns a dict mapping each span's ``code`` attribute value to the
    span's ``.string``. Spans without a (truthy) ``code`` attribute are
    ignored; when the same code occurs more than once the last span wins.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, "lxml")

    return {
        span.get('code'): span.string
        for span in parsed.find_all('span')
        if span.get('code')
    }

def is_month(href):
        return href and re.compile("month").search(href)

def getAreaDataRange(area):
    """Return the ``href`` of every link on an area's page that mentions a month.

    The page address is built as ``DATAURL + area + '.html?flag=1'``. Links
    are selected with the ``is_month`` filter and returned in document
    order.
    """
    page_url = ''.join((DATAURL, area, '.html?flag=1'))
    page = requests.get(page_url)
    parsed = BeautifulSoup(page.content, "lxml")

    month_links = parsed.find_all(href=is_month)
    return [anchor.get('href') for anchor in month_links]

def is_td(tag):
    """Tell whether *tag* is a ``tr`` element sitting directly inside a ``tbody``.

    Despite the name, the check is for table rows (``tr``), not cells.
    """
    if tag.parent.name != 'tbody':
        return False
    return tag.name == 'tr'

def getAreaData(area, url):
    """Fetch one monthly page and turn its table rows into records.

    Args:
        area: Value stored as the second field of every record.
        url: Page address. It must contain ``month=<year>-<month>``; the
            two parts are copied into each record's date string.

    Returns:
        A dict keyed by ``"<first cell>/<month>/<year>"``. Each value is
        the tuple ``(date, area, cell[1], cell[2], cell[4])``, where
        ``cell`` is the list of non-empty, stripped texts found among the
        row's children.

    Raises:
        IndexError: if *url* has no ``month=`` part.
        ValueError: if the text after ``month=`` contains no ``-``.
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "lxml")
    year, month = (url.split('month=', 1)[1]).split('-', 1)

    results = dict()
    for row in soup.find_all(is_td):
        # Collect the cells of THIS row only. Sharing one list across rows
        # would make every record repeat the first row's values.
        cells = []
        for child in row.children:
            text = child.string
            if text is None:
                # .string is None for an element that is empty or has
                # several children; fall back to its concatenated text.
                text = child.get_text()
            text = text.strip()
            if text:
                cells.append(text)

        # Rows without the five expected values (e.g. spacer rows) cannot
        # be turned into a record; skip them instead of failing.
        if len(cells) < 5:
            continue

        date = cells[0] + '/' + month + '/' + year
        results[date] = (date, area, cells[1], cells[2], cells[4])

    return results

# Driver: discover all areas, walk every monthly page of each area and
# append the extracted records to report.csv.
areas = getAreasList(INITIAL_URL)

# The file is opened in binary mode for the unicodecsv writer. The ``with``
# block guarantees it is flushed and closed even if a request or a parse
# step raises part-way through the crawl.
with open('report.csv', 'wb') as f:
    writer = csv.writer(f)

    # .items()/.values() exist on both Python 2 and Python 3 dicts, unlike
    # iteritems()/itervalues().
    for key, value in areas.items():
        print('Processing ' + key + ' ' + value)
        for url in getAreaDataRange(key):
            print('Processing url: ' + url)
            for line in getAreaData(value, url).values():
                writer.writerow(line)