-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch.py
84 lines (63 loc) · 2.28 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import json
import os
from services.downloader import Downloader
from services.reader import read_json
from services.timer import Timer
from services.writer import Writer
from services.web_parser import extract_page
# --- Configuration ---------------------------------------------------------
# Set to True to ignore any previously saved items file and rebuild from
# scratch; when False, an existing ITEMS_PATH switches the run to
# incremental mode (see ``just_fetch_new`` below).
OVERWRITE = False
ITEMS_PATH = "./api/items.json"
CODES_PATH = "./api/codes.json"

# --- Setup -----------------------------------------------------------------
downloader = Downloader()
amount_online = downloader.get_amount()
print(f"Found {amount_online} documents on regeringen.se")

# Default to fetching everything reported online; the incremental branch
# further down may lower this.
to_fetch = amount_online

# Incremental mode requires both a saved items file and OVERWRITE disabled.
just_fetch_new = os.path.exists(ITEMS_PATH) and not OVERWRITE

timer = Timer()
# Incremental run: load what was saved last time and work out how many of the
# most recent entries need to be fetched again.
if just_fetch_new:
    codes = read_json(CODES_PATH)

    # Saved items are flipped here; the merge step later appends the new
    # items and flips the list back.
    items = read_json(ITEMS_PATH)
    items.reverse()

    stats = read_json("./api/latest_updated.json")
    timer.set_latest_update(stats["latest_updated"])
    amount_saved = stats["items"]
    print(
        f"Found {amount_saved} existing items.",
        f"Found {len(codes)} existing codes.",
        sep="\n",
    )

    # Fetch window = difference between online and saved counts, plus a
    # fixed margin of 10, plus 5 per unit of delta beyond the first.
    # NOTE(review): presumably delta is the number of days since the last
    # update -- confirm against Timer.get_delta().
    delta = timer.get_delta()
    count_difference = abs(amount_online - amount_saved)
    to_fetch = count_difference + 10 + 5 * (delta - 1)
# Download the listing for the newest entries, keep the recently updated
# ones, then fetch, convert and store each page.
print(f"Fetching the latest {to_fetch} items...")
new_items, new_codes = downloader.get_latest_items(to_fetch)

# Keep only entries whose last-updated value is later than the timer's
# day-before cutoff.
# NOTE(review): this filter also runs on a full (non-incremental) fetch,
# where timer.set_latest_update() was never called -- confirm that
# Timer.day_before() behaves as intended in that case.
recent_items = []
for candidate in new_items:
    if Downloader.last_updated(candidate) > timer.day_before():
        recent_items.append(candidate)
new_items = recent_items

for new_item in new_items:
    url = new_item["url"]
    print(f"Fetching page at {url}...")
    page = downloader.get_webpage(url)
    md_content, metadata = extract_page(page)

    # The markdown file path mirrors the item's URL under data/.
    md_path = "data/" + url.strip("/") + ".md"
    Writer.write_md(md_content, md_path)

    new_item.update(metadata)

    # Lost the category here, only kept the ID!
    # NOTE(review): the key is the stringified category and the value is its
    # element at index 1 -- confirm the shape of metadata["categories"].
    new_codes.update(
        {str(category): category[1] for category in metadata["categories"]}
    )
# Merge the freshly fetched items into the saved state (incremental run), or
# simply adopt the fetched data as the complete state (full run).
if just_fetch_new:
    # Both lists are kept in flipped order while merging: the saved items
    # were reversed on load, so the new ones are reversed to match, appended
    # at the end, and the combined list is flipped back afterwards.
    new_items.reverse()

    # Drop every saved item that was fetched again so the fresh version
    # replaces it. A set gives O(1) URL lookups, and one filtering pass
    # replaces the earlier collect-indices-then-pop approach.
    refreshed_urls = {item["url"] for item in new_items}
    items = [item for item in items if item["url"] not in refreshed_urls]

    items.extend(new_items)
    items.reverse()
    codes.update(new_codes)
else:
    items, codes = new_items, new_codes
# Rebuild the codes mapping in sorted key order, with every key as a string.
sorted_codes = {}
for key in sorted(codes):
    sorted_codes[str(key)] = codes[key]
codes = sorted_codes

# Summary of this run; the next incremental run reads it back to size its
# fetch window.
latest_updated = dict(
    latest_updated=timer.start_string(),
    items=len(items),
    codes=len(codes),
)

# Persist items, codes and the run summary, in that order.
for payload, destination in (
    (items, ITEMS_PATH),
    (codes, CODES_PATH),
    (latest_updated, "./api/latest_updated.json"),
):
    Writer.write_json(payload, destination)