forked from CS3219-AY2223S1/cs3219-project-ay2223s1-g54
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.py
164 lines (141 loc) · 5.38 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import requests
import json
class Crawler:
    """Download non-paid LeetCode algorithm questions over one HTTP session.

    Creating an instance performs a GET against ``BASE_URL`` and stores the
    ``csrftoken`` cookie from that response on ``self.csrftoken``.
    """

    BASE_URL = "https://leetcode.com"
    GRAPHQL_URL = f"{BASE_URL}/graphql"
    ALGO_INTERNAL_URL = f"{BASE_URL}/api/problems/algorithms/"
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15"
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # GraphQL document posted by fetch_question(). The continuation lines are
    # flush-left on purpose: they are part of the string sent to the server.
    QUESTION_DATA_QUERY = '''query questionData($titleSlug: String!) {
question(titleSlug: $titleSlug) {
questionId
questionFrontendId
boundTopicId
title
titleSlug
content
translatedTitle
translatedContent
isPaidOnly
difficulty
likes
dislikes
isLiked
similarQuestions
exampleTestcases
categoryTitle
contributors {
username
profileUrl
avatarUrl
__typename
}
topicTags {
name
slug
translatedName
__typename
}
companyTagStats
codeSnippets {
lang
langSlug
code
__typename
}
stats
hints
solution {
id
canSeeDetail
paidOnly
hasVideoSolution
paidOnlyVideo
__typename
}
status
sampleTestCase
metaData
judgerAvailable
judgeType
mysqlSchemas
enableRunCode
enableTestMode
enableDebugger
envInfo
libraryUrl
adminUrl
challengeQuestion {
id
date
incompleteChallengeCount
streakCount
type
__typename
}
__typename
}
}'''

    def __init__(self):
        """Open a session and capture the CSRF cookie from the landing page.

        Raises:
            Exception: If the response carries no ``csrftoken`` cookie.
        """
        self.session = requests.Session()
        # A timeout keeps an unresponsive server from hanging the crawl.
        response = self.session.get(self.BASE_URL, timeout=self.REQUEST_TIMEOUT)
        token = next(
            (cookie.value for cookie in response.cookies if cookie.name == "csrftoken"),
            None,
        )
        if token is None:
            raise Exception("Unable to retrieve CSRF token")
        self.csrftoken = token

    def fetch_all_question_title_slugs(self):
        """Return the title slug of every entry in the algorithms listing.

        Returns:
            A list of slug strings, in the order the listing reports them.

        Raises:
            Exception: If the listing endpoint does not answer with HTTP 200.
        """
        response = self.session.get(
            self.ALGO_INTERNAL_URL, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            raise Exception("Unable to fetch question list")
        return [
            entry["stat"]["question__title_slug"]
            for entry in response.json()["stat_status_pairs"]
        ]

    def fetch_question(self, question_title_slug):
        """Fetch one question through the GraphQL endpoint.

        Args:
            question_title_slug: Slug identifying the question.

        Returns:
            A dict with the keys ``id``, ``title``, ``difficulty``,
            ``topicTags`` (list of ``name``/``slug`` dicts), ``content`` and
            ``codeSnippets`` (list of ``name``/``slug``/``code`` dicts), or
            ``None`` when the question is paid-only or the reply contains no
            question object.

        Raises:
            Exception: If the endpoint does not answer with HTTP 200.
        """
        headers = {
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "User-Agent": self.USER_AGENT,
            "Referer": f"{self.BASE_URL}/problems/{question_title_slug}",
        }
        payload = {
            "operationName": "questionData",
            "variables": {
                "titleSlug": question_title_slug,
            },
            "query": self.QUESTION_DATA_QUERY,
        }
        data = json.dumps(payload).encode("utf8")
        response = self.session.post(
            self.GRAPHQL_URL,
            data=data,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Unable to fetch {question_title_slug} content")

        # The reply can carry a null "data" or "question"; treat that like an
        # unavailable question instead of failing on a None subscript.
        question_object = (response.json().get("data") or {}).get("question")
        if not question_object or question_object["isPaidOnly"]:
            return None

        # Either list may be null in the reply, so fall back to an empty list.
        topic_tags = [
            {"name": tag["name"], "slug": tag["slug"]}
            for tag in question_object.get("topicTags") or []
        ]
        code_snippets = [
            {
                "name": snippet["lang"],
                "slug": snippet["langSlug"],
                "code": snippet["code"],
            }
            for snippet in question_object.get("codeSnippets") or []
        ]
        return {
            "id": question_object["questionId"],
            "title": question_object["title"],
            "difficulty": question_object["difficulty"],
            "topicTags": topic_tags,
            "content": question_object["content"],
            "codeSnippets": code_snippets,
        }

    def fetch_all_questions(self):
        """Fetch every listed question, skipping those that yield ``None``.

        Prints one progress line per question that was fetched.

        Returns:
            A list of the dicts produced by ``fetch_question``.
        """
        question_objects = []
        for question_title_slug in self.fetch_all_question_title_slugs():
            question_object = self.fetch_question(question_title_slug)
            if not question_object:
                continue
            print(f"Fetched {question_object['title']}.")
            question_objects.append(question_object)
        return question_objects
if __name__ == "__main__":
    # Script entry point: crawl every question and dump the results as JSON
    # into data.json in the current working directory.
    crawler = Crawler()
    question_objects = crawler.fetch_all_questions()
    # The context manager flushes and closes the file even if serialization
    # raises part-way through.
    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(question_objects, file)