-
Notifications
You must be signed in to change notification settings - Fork 0
/
Extracting_links.py
152 lines (137 loc) · 6.52 KB
/
Extracting_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from __future__ import print_function
import random
import time # for adding sleeping time
import requests #sending requests
import json
import datetime
import re # librarz for regular expressions
import pytz
import sys #redirecting links to a file
import os #define the current working directory
from bs4 import BeautifulSoup # working with HTML file
from pytz import timezone # working with timezones
def main():
    """Download MarketWatch headline links for every ticker in the symbol list.

    For each symbol the stock page is scraped for a starting headline (via
    load_params_from_html), then the ``getheadlines`` endpoint is requested
    page by page.  The story links of every page are appended to
    ``<symbol>.txt`` in the working directory, and the last headline of the
    page supplies the parameters of the next request.  Paging for a symbol
    stops on a non-200 response or when a page holds fewer than 10 items.
    """
    # All output files are written relative to this directory.
    os.chdir("C:\\Farshid\\Heidelberg\\third semester\\practical 2\\textFile")
    symbol_list = ['APC', 'XEC', 'CXO', 'COP', 'DVN', 'EOG', 'EQT', 'MRO', 'NFX', 'NBL', 'OXY',
                   'OKE', 'PXD', 'RRC', 'SWN', 'WMB', 'CHK', 'CVX', 'XOM', 'HES', 'MUR', 'KMI', 'MPC',
                   'PSX', 'SE', 'TSO', 'VLO']
    # symbol_list = ['XEC','TSO']
    baseUrl = 'http://www.marketwatch.com/news/headline/getheadlines'
    # Without a timeout a single stalled connection blocks the whole run.
    request_timeout = 30  # seconds

    for symbol in symbol_list:
        print('=' * 10)
        print('loading %s......\n' % symbol)
        unique_id, timestamp, title = load_params_from_html(symbol)
        # for debug
        # unique_id, timestamp, title = '765715675', '6:00 p.m. Nov. 03, 1997', 'testing'
        print('got unique_id=%r, from %r' % (unique_id, title))

        parameters = extract_first_parameters(unique_id, symbol, timestamp)
        parameters.update(extract_uid(unique_id))
        filename = symbol + '.txt'
        page_cnt = 0
        while True:
            page_cnt += 1
            resp = requests.get(baseUrl, params=parameters, timeout=request_timeout)
            if resp.status_code != 200:
                print("exception! status=%s, response:%s" % (resp.status_code, resp.text))
                break
            data = json.loads(resp.text)  # a full page is an array of 10 items

            print('\n--- page:%d' % page_cnt)
            print("got %d data, with args: " % (len(data)))
            print("%17s: %s" % ('messageNumber', parameters['messageNumber']))
            print("%17s: %s" % ('dateTime', parameters['dateTime']))
            print("%17s: %s" % ('sequence', parameters['sequence']))
            print("%17s: %s" % ('docId', parameters['docId']))

            # Section 1: Save links to file
            newsLink = ["http://www.marketwatch.com/story" + item['SeoHeadlineFragment']
                        for item in data]
            with open(filename, 'a') as fp:
                if newsLink:
                    fp.write('\n'.join(newsLink) + '\n')

            # Section 2: print results, and extract parameters for the next page
            if len(data) > 1:
                first = data[0]
                last = data[-1]
                print("results:\n\tfirst: %-42s, %s" % (first['UniqueId'], first['HeadlineText']))
                print("\t last: %-42s, %s" % (last['UniqueId'], last['HeadlineText']))
                # The next request continues from the last headline of this page.
                parameters.update(extract_uid(last['UniqueId']))
                parameters['dateTime'] = extract_link_timeStamp(
                    last['TimestampUtc'], last['EasternTimeOffset'])

            # Random pause between requests.
            # raw_input("press <enter> to get next")
            time.sleep(random.randint(5, 9))

            # Section 3: check continuing condition
            if len(data) < 10:
                break
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# this function loads the first page and returns parameters for sending the next request
def load_params_from_html(symbol):
    """Scrape the stock page of `symbol` for the headline that seeds paging.

    The page ``http://www.marketwatch.com/investing/stock/<symbol>`` is
    fetched and the element at index 1 of the ``li[data-uniqueid]`` nodes
    under ``#mwheadlines`` is used.
    NOTE(review): the original comment spoke of the "last news node", but the
    code has always taken index 1 -- confirm which one is intended.

    Returns:
        A tuple ``(unique_id, timestamp, title)``: the node's
        ``data-uniqueid`` attribute, the text of its first ``.timestamp``
        child, and the node's text with newlines replaced by spaces and
        surrounding whitespace stripped.

    Raises:
        Exception: if the response status is not 200.
        IndexError: if the expected headline or timestamp node is missing.
    """
    html_url = 'http://www.marketwatch.com/investing/stock/' + symbol
    # A timeout keeps one unresponsive page from hanging the whole run.
    resp = requests.get(html_url, timeout=30)
    if resp.status_code != 200:
        raise Exception("http request failed: %s" % resp)

    soup = BeautifulSoup(resp.text, 'lxml')
    headline_nodes = soup.select(
        "#mwheadlines > div.headlinewrapper > ol > li[data-uniqueid]")
    if len(headline_nodes) < 2:
        # Same exception type as the bare index would raise, but explains why.
        raise IndexError("expected at least 2 headline nodes for %s, found %d"
                         % (symbol, len(headline_nodes)))
    li_node = headline_nodes[1]

    timestamp_nodes = li_node.select('.timestamp')
    if not timestamp_nodes:
        raise IndexError("headline node for %s has no .timestamp element" % symbol)

    unique_id = li_node['data-uniqueid']
    timestamp = timestamp_nodes[0].text
    title = li_node.text.replace('\n', ' ').strip()
    return unique_id, timestamp, title
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# A function that extracts the required parameters for sending requests
def extract_uid(uid):
    """Split a headline unique id into the request fields it stands for.

    An id containing ':' (e.g. ``e5a00f51-8821-4fbc-8ac6-e5f64b5eb0f2:8499``)
    is split into ``sequence`` and ``messageNumber``; any other id is passed
    through as ``docId``.  Fields that do not apply are empty strings.

    Returns:
        A dict with the keys 'sequence', 'messageNumber' and 'docId'.
    """
    fields = dict.fromkeys(('sequence', 'messageNumber', 'docId'), '')
    if ':' not in uid:
        # No separator: the whole id is the document id.
        fields['docId'] = uid
        return fields
    fields['sequence'], fields['messageNumber'] = uid.split(':')
    return fields
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# extract the timestamp of the links with integer UniqueIds
def extract_link_timeStamp(TimestampUtc, offset):
    """Format a headline's UTC timestamp as the `dateTime` request parameter.

    Args:
        TimestampUtc: string with a millisecond epoch value enclosed in
            parentheses; the text between the first '(' and ')' is parsed.
        offset: number of hours (anything ``int()`` accepts) added to the
            UTC time before formatting.

    Returns:
        A string such as ``'6:00 p.m. Nov. 03, 1997'``: 12-hour clock without
        a leading zero, 'a.m.'/'p.m.' markers, abbreviated month name.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    millis = int(re.split(r'\(|\)', TimestampUtc)[1])
    # Epoch + timedelta avoids the deprecated utcfromtimestamp() and also
    # works for values before 1970.
    utc_date = datetime.datetime(1970, 1, 1) + datetime.timedelta(milliseconds=millis)
    shifted = utc_date + datetime.timedelta(hours=int(offset))
    dateTime = shifted.strftime('%I:%M %p %b. %d, %Y')
    dateTime = dateTime.replace('AM', 'a.m.').replace('PM', 'p.m.')
    # %I zero-pads the hour; drop that padding.
    return dateTime.lstrip('0')
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# different symbols start with one of two different kinds of 'data-uniqueid';
# the following function builds the parameters of the first request from the first unique_id of the news links
def extract_first_parameters(first_uid, symbol, timestamp):
    """Build the query parameters of the first ``getheadlines`` request.

    Args:
        first_uid: unique id of the starting headline; either
            ``<sequence>:<messageNumber>`` or a plain document id.
        symbol: ticker symbol the headlines are requested for.
        timestamp: value sent as the ``dateTime`` parameter.

    Returns:
        A dict with the keys 'ticker', 'countryCode', 'docType', 'count',
        'channelName', 'dateTime', 'docId', 'sequence' and 'messageNumber'.
    """
    parameters = {
        'ticker': symbol,
        'countryCode': 'US',
        'docType': '806',
        'count': '10',
        # The original literal repeated 'channelName' with a hard-coded
        # '/news/latest/company/us/xom', which overrode this per-symbol value
        # for every ticker.
        # NOTE(review): lower-cased to match that literal, the only channel
        # value the original ever sent; confirm the endpoint's case handling.
        'channelName': '/news/latest/company/us/' + symbol.lower(),
        'dateTime': timestamp,
    }
    if ':' in first_uid:
        sequence, messageNumber = first_uid.split(':')
        parameters.update({
            'docId': '',
            'sequence': sequence,
            'messageNumber': messageNumber,
        })
    else:
        parameters.update({
            'docId': first_uid,
            'sequence': '',
            'messageNumber': '',
        })
    return parameters
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()