forked from allanlepp/te_rss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rss_maker.py
161 lines (126 loc) · 7.96 KB
/
rss_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
RSS failisisu genereerimine.
"""
from lxml import etree
import parsers_common
import parsers_datetime
import rss_config
import rss_print
def rssmaker(dataset, titleText, domainText, linkText, descriptionText):
    """Build an RSS 2.0 document from parallel lists of entry fields.

    Args:
        dataset: Mapping with the keys "urls", "titles", "descriptions",
            "pubDates", "images" and "authors". Every value is indexed by
            entry position; the length of "urls" decides how many entries
            are visited.
        titleText: Channel title. Also the prefix of the fallback item title
            and the value authors are compared against (via ``upper()``).
        domainText: Site base URL. Used to complete links that do not start
            with "http" and to decide whether "http://" item links are
            rewritten to "https://".
        linkText: Channel link.
        descriptionText: Channel description.

    Returns:
        An ``etree.ElementTree`` whose root is the ``<rss version="2.0">``
        element.
    """
    root = etree.Element("rss", version="2.0")
    channel = etree.SubElement(root, "channel")

    title = etree.SubElement(channel, "title")
    title.text = titleText

    link = etree.SubElement(channel, "link")
    link.text = linkText

    description = etree.SubElement(channel, "description")
    description.text = descriptionText

    # Date format per https://cyber.harvard.edu/rss/rss.html:
    # "Sat, 07 Sep 2002 09:42:31 GMT", i.e. https://tools.ietf.org/html/rfc822
    curTimeFloat = parsers_datetime.time_float()
    # Entries older than 31 days (in seconds) may be dropped, see below.
    curTimeFloatLimit = curTimeFloat - (31 * 24 * 60 * 60)
    curTimeFormat = "%a, %d %b %Y %H:%M:%S %z"  # Fri, 17 May 2019 13:37:00 +0300
    curGenerTime = parsers_datetime.float_to_datetime_rfc2822(curTimeFloat)

    lastBuildDate = etree.SubElement(channel, "lastBuildDate")
    lastBuildDate.text = curGenerTime

    # https://cyber.harvard.edu/rss/rss.html#ltttlgtSubelementOfLtchannelgt:
    # "Number of minutes that indicates how long a channel can be cached before refreshing from the source."
    ttl = etree.SubElement(channel, "ttl")
    ttl.text = str(12 * 60)

    urlsLen = len(dataset["urls"])
    urlsLenStr = str(urlsLen)

    for i in range(urlsLen):
        # Human-readable "position/total" tag used in every log message.
        curKandeString = str(i + 1) + "/" + urlsLenStr

        # --- link + guid (mandatory; an entry without a URL is skipped) ---
        # parsers_common.get presumably returns a truthy value only when
        # index i holds usable data -- verify against parsers_common.
        if parsers_common.get(dataset["urls"], i, printWarning=0):
            item = etree.SubElement(channel, "item")
            itemLink = etree.SubElement(item, "link")

            curValue = dataset["urls"][i]
            curValue = curValue.rstrip("/")
            if not curValue.startswith("http"):
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") lingi algusest ei leitud 'http': " + curValue, 3)
                curValue = parsers_common.str_domain_url(domainText, curValue)
            if "http://" in curValue and "https://" in domainText:
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") lingist leiti 'http://': " + curValue, 1)
                curValue = curValue.replace("http://", "https://")
            itemLink.text = curValue

            # https://cyber.harvard.edu/rss/rss.html: A string that uniquely identifies the item.
            itemGuid = etree.SubElement(item, "guid")
            curValueWithoutHttp = parsers_common.str_lchop_url(curValue)
            itemGuid.text = curValueWithoutHttp.encode('ascii', 'xmlcharrefreplace')
        else:
            rss_print.print_debug(__file__, "kande(" + curKandeString + ") puudub vajalik url, katkestame", 0)
            continue

        # --- title (falls back to "<channel title> <guid>") ---
        if parsers_common.get(dataset["titles"], i, printWarning=0):
            itemTitle = etree.SubElement(item, "title")
            curValue = dataset["titles"][i]
            curValue = curValue.replace("<br>", " ")
            curValue = parsers_common.str_cleanup_title(curValue)
            itemTitle.text = curValue.encode('ascii', 'xmlcharrefreplace')
        else:
            rss_print.print_debug(__file__, "kande(" + curKandeString + ") aadressil puudub vajalik pealkiri: " + str(itemLink.text), 0)
            itemTitle = etree.SubElement(item, "title")
            itemTitle.text = title.text + " " + itemGuid.text

        # --- description (falls back to the item title) ---
        if parsers_common.get(dataset["descriptions"], i, printWarning=0):
            itemDescription = etree.SubElement(item, "description")
            curValue = dataset["descriptions"][i]
            curValue = parsers_common.str_cleanup_post(curValue)
            curValue = parsers_common.str_cleanup_description(curValue)
            itemDescription.text = curValue.encode('ascii', 'xmlcharrefreplace')
        else:
            rss_print.print_debug(__file__, "kande(" + curKandeString + ") pealkirjal puudub vajalik kirjeldus: '" + str(itemTitle.text) + "' - " + str(itemLink.text), 0)
            itemDescription = etree.SubElement(item, "description")
            itemDescription.text = itemTitle.text

        # --- pubDate (optional) ---
        if parsers_common.get(dataset["pubDates"], i, printWarning=0):
            curValue = dataset["pubDates"][i]
            postTimeFloat = parsers_datetime.raw_to_float(curValue, curTimeFormat)

            if rss_config.UPLOAD_TIME_LIMIT is True and postTimeFloat < curTimeFloatLimit:
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") '" + itemTitle.text + "' aeg: '" + curValue + "' on vanem kui 31 päeva, eemaldame kande", 2)
                channel.remove(item)
                # The item is now detached from the feed, so nothing added to
                # it afterwards could reach the output: move to the next entry.
                continue

            if postTimeFloat < 1:
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") '" + itemTitle.text + "' aeg: '" + curValue + "' on eelajalooline!, asendame hetkeajaga", 0)
                curValue = parsers_datetime.float_to_datetime_rfc2822(curTimeFloat)
            elif postTimeFloat > curTimeFloat:
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") '" + itemTitle.text + "' aeg: '" + curValue + "' on tulevikust?", 1)
            # NOTE(review): `i == urlsLen` can never hold inside
            # `range(urlsLen)`, so this ordering check never runs. It is kept
            # unchanged because enabling it would alter logging and would index
            # pubDates[i-1] without a guard -- confirm the intended condition.
            elif i == urlsLen and parsers_datetime.raw_to_float(dataset["pubDates"][i], curTimeFormat) < parsers_datetime.raw_to_float(dataset["pubDates"][i-1], curTimeFormat):
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") '" + itemTitle.text + "' aeg: '" + curValue + "' on eelmisest väiksem", 0)

            itemPubdate = etree.SubElement(item, "pubDate")  # https://cyber.harvard.edu/rss/rss.html
            itemPubdate.text = curValue

        # --- enclosure (optional media link) ---
        if parsers_common.get(dataset["images"], i, printWarning=0):
            # https://cyber.harvard.edu/rss/rss.html
            # <enclosure url="http://www.scripting.com/mp3s/weatherReportSuite.mp3" length="12216320" type="audio/mpeg" />
            curValue = dataset["images"][i]
            encType = ""

            curValue = curValue.strip()
            # Containment test (not `find(...) > 0`) so that a value which
            # *starts* with "url(" is unwrapped as well.
            if "url(" in curValue:
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") korrastame parsimata meedialingi:': " + curValue, 1)
                curValue = curValue.split("url('")[-1].strip("');")
                encType = "image/jpeg"

            if curValue.startswith("//"):
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") lisame meedialingi algusesse 'http:': " + curValue, 2)
                curValue = "http:" + curValue

            # Every "https" occurrence in the media link is rewritten to "http".
            curValue = curValue.replace("https", "http")

            # Anything shorter than the domain plus a minimal file name
            # ("1.jpg") is treated as not being a real media link.
            if len(curValue) < len(domainText + "1.jpg"):
                rss_print.print_debug(__file__, "kande(" + curKandeString + ") ei lisa RSS-i meedialinki, kuna see on liiga lühike: '" + curValue + "'", 0)
            else:
                if not curValue.startswith("http"):
                    rss_print.print_debug(__file__, "kande(" + curKandeString + ") meedialingi algusest ei leitud 'http'-d: '" + curValue + "'", 1)
                    curValue = parsers_common.str_domain_url(domainText, curValue)

                # Here `> 0` is intentional: it reports an "http" found
                # somewhere after the start of the link.
                if curValue.rfind('http') > 0:
                    rss_print.print_debug(__file__, "kande(" + curKandeString + ") meedialingi keskelt leiti 'http': '" + curValue + "'", 0)

                # Guess the MIME type from the extension unless it was already
                # set above; it stays "" when no known extension is found.
                if not encType:
                    if (".jpg" in curValue) or (".jpeg" in curValue):
                        encType = "image/jpeg"
                    elif ".png" in curValue:
                        encType = "image/png"
                    elif ".mp3" in curValue:
                        encType = "audio/mpeg"

                curValue = curValue.replace("&", "%26")
                curValue = curValue.encode('ascii', 'xmlcharrefreplace')

                etree.SubElement(item, "enclosure", {'url': curValue, 'type': encType})

        # --- author (optional; omitted when it just repeats the channel title) ---
        if parsers_common.get(dataset["authors"], i, printWarning=0):
            curValue = dataset["authors"][i]
            if curValue.upper() != titleText.upper():
                itemAuthor = etree.SubElement(item, "author")
                itemAuthorName = etree.SubElement(itemAuthor, "name")
                itemAuthorName.text = curValue.encode('ascii', 'xmlcharrefreplace')

    ret = etree.ElementTree(root)
    return ret