-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
90 lines (72 loc) · 3.97 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import config
import bingapi
import xmltodict
from datetime import datetime,timedelta
# Device types requested from every keyword-research call made by getall().
_DEVICES = ("Computers", "NonSmartphones", "Smartphones", "Tablets")


def _save_response(raw_xml, response_key, path):
    """Extract one SOAP response element from `raw_xml` and write it to `path` as JSON.

    `response_key` is the name of the element directly below
    ``s:Envelope`` / ``s:Body`` that should be serialized.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    parsed = xmltodict.parse(raw_xml)
    jsonout = bing.toPrettyJson(parsed['s:Envelope']['s:Body'][response_key])

    # The per-country output directories are not created anywhere else; make
    # sure the target exists so that a first run does not fail on open().
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    with open(path, 'w') as f:
        f.write(jsonout)


def getall(country_code, countries, language, keywords, fromyear, frommonth, fromday, toyear, tomonth, today):
    """Query three keyword reports for `keywords` and store each response as JSON.

    Relies on the module-level ``bing`` helper object and ``api`` client.

    Parameters:
        country_code: used as ``ParentCountry`` for the locations request and
            as the suffix of the output directory names.
        countries: list of country codes sent as ``PublisherCountries``.
        language: value sent as ``Language``.
        keywords: list of keywords sent as ``Keywords``.
        fromyear, frommonth, fromday: start date of the historical search
            count request; also used to build the output file names.
        toyear, tomonth, today: end date of the historical search count
            request.

    Files written (relative to the current working directory):
        HistoricalSearchCount_<country_code>/<fromyear><frommonth><fromday>.json
        KeywordDemographics_<country_code>/<fromyear><frommonth><fromday>.json
        KeywordLocations_<country_code>/<fromyear><frommonth><fromday>_<level>.json
            (one file per level 1, 2 and 3)

    Only the historical search count request is given the date range; the
    demographics and locations requests are sent without dates, but their
    files are still named after the start date.
    """
    datestamp = '{0}{1}{2}'.format(fromyear, frommonth, fromday)

    # Get Historical Search Count ############################################
    historical_search_count = api.service.GetHistoricalSearchCount(
        Keywords=bing.soapStrArr(api, keywords),
        Language=language,
        PublisherCountries=bing.soapStrArr(api, countries),
        Devices=bing.soapStrArr(api, list(_DEVICES)),
        StartDate=bing.soapDate(api, fromyear, frommonth, fromday),
        EndDate=bing.soapDate(api, toyear, tomonth, today),
        TimePeriodRollup="Daily"
    )
    _save_response(
        historical_search_count,
        'GetHistoricalSearchCountResponse',
        'HistoricalSearchCount_{0}/{1}.json'.format(country_code, datestamp),
    )

    # Get Demographics #######################################################
    demographics_request = api.service.GetKeywordDemographics(
        Keywords=bing.soapStrArr(api, keywords),
        Devices=bing.soapStrArr(api, list(_DEVICES)),
        Language=language,
        PublisherCountries=bing.soapStrArr(api, countries)
    )
    _save_response(
        demographics_request,
        'GetKeywordDemographicsResponse',
        'KeywordDemographics_{0}/{1}.json'.format(country_code, datestamp),
    )

    # Get Locations, one request and one output file per level ###############
    for level in range(1, 4):
        locations_request = api.service.GetKeywordLocations(
            Keywords=bing.soapStrArr(api, keywords),
            Devices=bing.soapStrArr(api, list(_DEVICES)),
            Language=language,
            Level=level,
            ParentCountry=country_code,
            PublisherCountries=bing.soapStrArr(api, countries)
        )
        _save_response(
            locations_request,
            'GetKeywordLocationsResponse',
            'KeywordLocations_{0}/{1}_{2}.json'.format(country_code, datestamp, level),
        )
##################################################################################################################
##################################################################################################################
##################################################################################################################
# Module-level API handles; getall() reads both of these as globals.
bing = bingapi.bingapi()
api = bing.getapi(config.developertoken, config.customerid, config.accountid, config.clientid, config.clientsecret)

# Read the keyword lists, one keyword per line, with surrounding whitespace
# removed.
with open("keywords.gb") as f:
    keywordsuk = [line.strip() for line in f]
with open("keywords.dk") as f:
    # The Danish list is decoded from UTF-8 (Python 2 byte str -> unicode).
    keywordsdk = [line.decode("utf8").strip() for line in f]

# It is not clear exactly when the API makes data available. Values (counts
# etc.) may simply be null if the data is not ready yet. So fetch today's data
# and also re-fetch the previous two days, overwriting their files, so that
# "empty" files get replaced by files with useful data.
# Be aware of this when actually using the data!
#
# The clock is read once so that every date used in this run is derived from
# the same instant, even if the run crosses midnight.
now = datetime.now()
for i in range(0, 3):
    day = now - timedelta(days=i)
    # Each request covers a single day: start date and end date are identical.
    fromyear = toyear = day.strftime("%Y")
    frommonth = tomonth = day.strftime("%m")
    fromday = today = day.strftime("%d")
    getall('GB', ['GB'], 'English', keywordsuk, fromyear, frommonth, fromday, toyear, tomonth, today)
    getall('DK', ['DK'], 'Danish', keywordsdk, fromyear, frommonth, fromday, toyear, tomonth, today)