# app.py (forked from jakeseaton/aerocene)
import json
import os
import random
import time
from datetime import datetime

import boto3
from flask import Flask, jsonify, request

import functions
import instagram
import queries
import settings

'''
This file implements a Flask server
that is deployed as a single Lambda function.
The @app.route decorators map functions to
URLs, so that when you run `sls wsgi serve`
you can go to localhost/<function> to
execute that function.
'''
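
# Example local usage (port 5000 is serverless-wsgi's usual default and is an
# assumption here; your config may differ):
#   $ sls wsgi serve
#   $ curl http://localhost:5000/
#   {"message": "Welcome to Aerocene!", "status": 200}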

app = Flask(__name__)


# index route
@app.route("/")
def hello(*args, **kwargs):
    '''
    Indicates that this Aerocene build is live.
    '''
    return jsonify({"status": 200, "message": "Welcome to Aerocene!"})


###
# Adversarial Server
# Exposes endpoints that emulate canonical methods of preventing scraping:
#   - rate limiting
#   - exponential backoff
#   - blacklisting
###
@app.route("/rate_limit")
def rate_limit(*args, **kwargs):
'''
Limit the number of inbound requests to a certain
amount per day.
'''
# extract the ip address from the request
address = request.remote_addr
print("Address", address)
# get or create a record for this address
record = queries.get_or_create_address(address)['Item']
# determine how many requests have been sent by this
# address
request_count = record.get("request_count")["N"]
# if that's greater than the max we allow per address
if int(request_count) >= settings.MAX_REQUESTS_PER_ADDRESS:
# bye felicia
return jsonify({
"status": 429,
"error": "Too Many Requests. Try again later."
})
# else increment the counter
queries.increment_requests_for_address(address)
# and return the updated record
return jsonify(queries.get_address(address).get("Item"))
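
# A sketch of the record shape this endpoint returns. The exact attributes
# depend on queries.get_or_create_address, but the .get("request_count")["N"]
# access above implies DynamoDB's low-level attribute-value format, e.g.:
#   {"address": {"S": "203.0.113.7"}, "request_count": {"N": "3"}}
# (attribute names other than request_count are assumptions)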
@app.route("/backoff")
def backoff(*args, **kwargs):
'''
Implements exponential backoff by ip address.
After the maximum number of requests from
a particular address have been seen, sleeps
for an exponentially increasing amount of time
before returning
'''
address = request.remote_addr
print("BACKOFF Address", address)
record = queries.get_or_create_address(address).get("Item")
request_count = int(record.get("request_count").get("N"))
# if they've sent more requests than we allow per address
if int(request_count) >= settings.MAX_REQUESTS_PER_ADDRESS:
extra_requests = request_count - settings.MAX_REQUESTS_PER_ADDRESS
# backoff exponentially
time.sleep(2 ** extra_requests)
queries.increment_requests_for_address(address)
return jsonify(queries.get_address(address).get("Item"))
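
# Worked example of the backoff curve, assuming a hypothetical
# MAX_REQUESTS_PER_ADDRESS of 10:
#   request_count = 10 -> extra_requests = 0 -> sleep 1s
#   request_count = 11 -> extra_requests = 1 -> sleep 2s
#   request_count = 15 -> extra_requests = 5 -> sleep 32s
# i.e. the delay doubles with every request past the limit.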
@app.route("/blacklist")
def blacklist(*args, **kwargs):
'''
Implements blacklisting
After the maximum number of requests
from a particular address have been seen,
mark the address as banned and never
respond to it again.
'''
address = request.remote_addr
print("Address", address)
record = queries.get_or_create_address(address).get("Item")
request_count = int(record.get("request_count").get("N"))
# if they've sent more requests than we allow per address
if int(request_count) >= settings.MAX_REQUESTS_PER_ADDRESS or record.get("blacklisted").get("BOOL"):
queries.blacklist_address(address)
return jsonify({ "status": 403, "error": "Forbidden", "blacklisted": True })
queries.increment_requests_for_address(address)
return jsonify(queries.get_address(address).get("Item"))
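
# Behavior over time, in contrast to /rate_limit: once request_count crosses
# the limit the record is flagged, so every later request hits the blacklisted
# branch and gets the 403 even if the counter were reset.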
@app.route("/clear_address")
def clear_address(*args, **kwargs):
'''
For testing purposes, expose an endpoint
to clear the record of a client's ip address
'''
address = request.remote_addr
print("Clearing address", address)
try:
queries.delete_address(address)
return jsonify({ "status": 200, "message": "Successfully cleared %s" % address })
except Exception as e:
return jsonify({ "status": 500, "error": e })


###
# Batch scraping mechanism. Enables a client
# to create scraping jobs.
###
@app.route('/batch_scrape', methods=['GET'])
def batch_scrape(*args, **kwargs):
    '''
    Endpoint for initializing batch scrapes.
    When called, creates a record in DynamoDB
    that causes the CloudFormation stack to begin
    scraping and updating that record.
    '''
    # extract arguments from query parameters on the url
    # (query parameters arrive as strings, so cast the page arguments)
    location = request.args.get("location", settings.DEFAULT_LOCATION)
    start_page = int(request.args.get("start_page", 0))
    end_page = int(request.args.get("end_page", 1))
    page_size = int(request.args.get("page_size", settings.PAGE_SIZE))
    # create the scrape record
    scrape_id = queries.create_scrape(start_page, end_page, location, page_size)
    # return a json response that describes the created record
    return jsonify({
        "status": "success",
        "message": "Batch scraping job was created successfully",
        "scrape_id": scrape_id,
        "location": location,
        "start_page": start_page,
        "end_page": end_page,
        "page_size": page_size,
    })
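
# A sketch of kicking off a job (the location value is a made-up example ID;
# real values depend on what instagram.scrape expects):
#   $ curl 'http://localhost:5000/batch_scrape?location=213385402&start_page=0&end_page=3'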
@app.route("/is_scrape_complete", methods=['GET'])
def is_scrape_complete(*args, **kwargs):
'''
Endpoint a waiting machine can query to determine if a scrape they initiated
has finished yet.
Queries the scrape in question, checks if it is done, and returns
a json continaing the result as a boolean.
'''
# extract scrape id from query parameters
scrape_id = request.args.get("scrape_id", "")
# handle error
if not scrape_id:
return jsonify({"error": "You must specify a scrape_id to check"})
# check if done and return a response
return jsonify({
"scrape_id": scrape_id,
"is_complete": functions.check_if_scrape_is_complete(scrape_id)
})
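
# A waiting client might poll this endpoint until the job finishes. Minimal
# sketch of such a client (not part of this server; assumes the requests
# library and a base_url/scrape_id from a prior /batch_scrape call):
#   while not requests.get(base_url + "/is_scrape_complete",
#                          params={"scrape_id": scrape_id}).json()["is_complete"]:
#       time.sleep(5)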

@app.route('/create_batch_scrape', methods=['POST'])
def batch_scrape_post(*args, **kwargs):
    '''
    POST request implementation of batch_scrape.
    '''
    location = request.form.get("location", settings.DEFAULT_LOCATION)
    start_page = int(request.form.get("start_page", 0))
    end_page = int(request.form.get("end_page", 2))
    page_size = int(request.form.get("page_size", settings.PAGE_SIZE))
    # pass page_size through, matching the GET implementation above
    scrape_id = queries.create_scrape(start_page, end_page, location, page_size)
    return jsonify({"scrape_id": scrape_id})


###
# Scraping functionality.
###
@app.route('/scrape_instagram', methods=['GET'])
def scrape_instagram(*args, **kwargs):
    '''
    Endpoint that can be queried to manually scrape Instagram
    at a particular location and cursor.
    '''
    # get the location and cursor from the get parameters of the request
    location = request.args.get('location', settings.DEFAULT_LOCATION)
    cursor = request.args.get('cursor', '')
    page_size = int(request.args.get("page_size", settings.PAGE_SIZE))
    # if we received neither one
    if not location and not cursor:
        # return an error
        return jsonify({'error': 'You must specify a location or an end cursor to scrape.'})
    # use the instagram module to scrape on those parameters
    # and return the result
    result = instagram.scrape({'location': location, 'cursor': cursor}, {}, page_size)
    return jsonify(result)
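
# Manual one-page scrape, sketched with assumed parameter values; a cursor,
# if supplied, would be an end cursor returned by a previous scrape:
#   $ curl 'http://localhost:5000/scrape_instagram?location=213385402&page_size=12'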