-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.js
92 lines (71 loc) · 2.29 KB
/
crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
const { JSDOM } = require('jsdom');
/**
 * Reduces a URL to a comparison key made of its host and path.
 *
 * Only the `host` and `pathname` reported by the `URL` parser are kept, so the
 * protocol, query string and fragment do not take part in the key. A single
 * trailing slash, if present, is removed.
 *
 * @param {string} url - Absolute URL to normalize.
 * @returns {string} `host` followed by `pathname`, minus one trailing `/`.
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor.
 */
const normalizeURL = (url) => {
  const { host, pathname } = new URL(url);
  const hostAndPath = host + pathname;
  // Strip at most one trailing slash so ".../path" and ".../path/" share a key.
  return hostAndPath.endsWith('/') ? hostAndPath.slice(0, -1) : hostAndPath;
};
/**
 * Collects the link targets of every `<a>` element in an HTML document.
 *
 * Each anchor's `href` is handled in one of two ways:
 *  - values starting with `/` are resolved against `baseURL`;
 *  - every other value must parse as an absolute URL on its own.
 * Values that fail to parse are logged and left out of the result.
 *
 * @param {string} htmlString - HTML markup to scan.
 * @param {string} baseURL - Base used to resolve hrefs that start with `/`.
 * @returns {string[]|undefined} Serialized URLs in anchor order, or `undefined`
 *   (after logging a message) when `baseURL` is falsy.
 */
const getURLsFromHTML = (htmlString, baseURL) => {
  if (!baseURL) {
    console.log('Please provide a baseURL');
    return;
  }

  const { document } = new JSDOM(htmlString).window;
  const hrefs = Array.from(document.querySelectorAll('a'), (anchor) => anchor.href);

  const collected = [];
  for (const href of hrefs) {
    const startsAtRoot = href.startsWith('/');
    try {
      const parsed = startsAtRoot ? new URL(href, baseURL) : new URL(href);
      collected.push(parsed.href);
    } catch (err) {
      // The two cases use slightly different separators in their log output.
      console.log(startsAtRoot ? `${err.message}:: ${href}` : `${err.message}: ${href}`);
    }
  }
  return collected;
};
/**
 * Recursively crawls a site and counts how often each of its pages is linked.
 *
 * `pages` is mutated in place and also returned. Its keys are the strings
 * produced by `normalizeURL`; its values are the number of times that page has
 * been encountered. A page is fetched only the first time it is seen; later
 * encounters just bump its counter. Links are followed one at a time, in the
 * order `getURLsFromHTML` returns them.
 *
 * A page is still counted, but its links are not followed, when the request
 * fails, the status is 400 or above, or the response is not `text/html`.
 *
 * @param {string} baseURL - Root URL of the site; only URLs on its host are crawled.
 * @param {string} [currentURL=baseURL] - URL of the page to visit in this call.
 * @param {Object<string, number>} [pages={}] - Tally accumulated so far.
 * @returns {Promise<Object<string, number>>} The same `pages` object, updated.
 * @throws {TypeError} If `baseURL` or `currentURL` cannot be parsed by `URL`.
 */
const crawlPage = async (baseURL, currentURL = baseURL, pages = {}) => {
  // Stay on the site being crawled: other hosts are neither fetched nor counted.
  if (new URL(currentURL).host !== new URL(baseURL).host) {
    return pages;
  }

  const normalizedURL = normalizeURL(currentURL);
  if (pages[normalizedURL] > 0) {
    pages[normalizedURL]++;
    return pages;
  }
  // Record the visit before fetching so links back to this page do not recurse forever.
  pages[normalizedURL] = 1;

  let htmlBody;
  try {
    const res = await fetch(currentURL);
    if (res.status >= 400) {
      console.log(`Got HTTP error, status code: ${res.status}`);
      return pages;
    }
    // Headers.get() yields null when the header is absent; treat that as "not HTML".
    const contentType = res.headers.get('Content-Type');
    if (!contentType?.includes('text/html')) {
      console.log('Probably not a valid webpage...');
      return pages;
    }
    htmlBody = await res.text();
  } catch (e) {
    // Request or body read failed: there is no markup to scan, so stop here.
    console.log(`${e.message}`);
    return pages;
  }

  for (const nextUrl of getURLsFromHTML(htmlBody, baseURL)) {
    // Every call returns the same `pages` object, so the result need not be reassigned.
    await crawlPage(baseURL, nextUrl, pages);
  }
  return pages;
};
// CommonJS exports: make the three crawler functions available to other modules.
module.exports = { normalizeURL, getURLsFromHTML, crawlPage };