forked from sirob/Ondn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_pull.py
62 lines (53 loc) · 1.72 KB
/
web_pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from GA_ondn import catch_data, file_to_string, clear_subcat
import urllib.request
import re
import html.parser
def get_article_list(filename):
''' (str) -> tuple
Returns a tuple containing a part of url and number of pageviews
extracted from the csv file.
'''
text = file_to_string(filename)
tup = catch_data(text)
clear_tup = clear_subcat(tup)
return clear_tup[2]
def get_urls(filename):
''' (str) -> list of str
Returns a list of www.dnevnik.si full urls.
'''
li = get_article_list(filename)
urls = [('http://www.dnevnik.si') + x[0] for x in li]
return urls
def get_pageviews(filename):
''' (str) -> list of str
Returns a list of pageviews for each of the urls from csv file.
'''
li = get_article_list(filename)
pageviews = [x[1] for x in li]
return pageviews
def get_title(url):
''' (str) -> str
Gets the source code of a webpage of a given url and returns
the title of the page, extracted with regular expressions.
'''
response = urllib.request.urlopen(url)
html = response.read()
text = html.decode('utf-8')
match = re.search("<title>(.+)<", text)
return match.group(1)
def get_titles(filename):
''' (str) -> list of str
Returns the list of webpage titles from the urls in the csv file.
'''
h = html.parser.HTMLParser()
titles_non = [get_title(x) for x in get_urls(filename)]
titles = [h.unescape(x) for x in titles_non]
return titles
if __name__ == "__main__":
filename = input("Path to file:")
urls = get_urls(filename)
titles = get_titles(filename)
pageviews = get_pageviews(filename)
titles_pageviews = zip(titles, pageviews)
for x in titles_pageviews:
print(x)