This repository has been archived by the owner on Jul 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
initialisation.R
152 lines (143 loc) · 4.33 KB
/
initialisation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
initPackage <- function(lib) {
# Load packages, and install them if not installed yet.
#
# Args:
# lib: A Character vector with names of libraries to be loaded
#
# Returns:
# None
#
# Side-Effects:
# 1. Install required libraries if not installed yet
# 2. Load the library into Global Environment
# 3. Remove itself
tryCatch(
{
for (l in lib) {
eval(parse(text=paste0("library(",l,")")))
}
},
error = function(e) {
cat(l, 'not found...\nInstalling', l, 'now...\n')
install.packages(l)
initPackage(lib)
},
finally = cat('All packages loaded!')
)
on.exit(rm(initPackage, envir = .GlobalEnv))
}
initPhantomJS <- function() {
# Download PhantomJS if it's not there. Currently only supports Linux and macOS and there's no plan to support non-*nix system
#
# Args:
# None
# Returns:
# None
# Side-effects:
# 1. A file named phantomjs is stored in the working directory.
# 2. Remove itself
if (!file.exists('phantomjs')) {
if (Sys.info()['sysname'] == 'Linux') {
phJS_url <- 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2'
phJS_name <- 'phatomjs.tar.bz2'
phJS_bin <- './phantomjs-2.1.1-linux-x86_64/bin/phantomjs'
phJS_dir <- './phantomjs-2.1.1-linux-x86_64'
} else if (Sys.info()['sysname'] == 'Darwin') {
phJS_url <- 'https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-macosx.zip'
phJS_name <- 'phatomjs.zip'
phJS_bin <- './phantomjs-2.1.1-linux-macos/bin/phantomjs'
phJS_dir <- './phantomjs-2.1.1-linux-macos'
}
download.file(phJS_url, phJS_name)
untar(phJS_name)
file.copy(phJS_bin, getwd())
file.remove(phJS_name)
unlink(phJS_dir, TRUE)
}
on.exit(rm(initPhantomJS, envir = .GlobalEnv))
}
dlMenu <- function() {
# Download the meta-data containing the url to each individual peoms
#
# Args:
# None
# Returns:
# A data.table with url to the following categories of Tang poems:
# 1. Wulü, 5-character 8-line verse
# 2. Qilü, 7-character 8-line verse
# 3. Wujue, 5-character 4-line verse
# 4. Qijue, 7-character 4-line verse
# Side-effects:
# Remove it self
DTMenu <- data.table(type = character(),
author = character(),
title = character(),
url = character())
typeNames <- c('Wulü', 'Qilü', 'Wujue', 'Qijue')
for (i in 1:4) {
batch <-
read_html('https://zh.wikisource.org/wiki/唐詩三百首') %>%
html_nodes(xpath = glue('/html/body/div[3]/div[3]/div[4]/div/ol[',
i + 3,
']/li'))
type <- typeNames[i]
author <-
batch %>%
html_text() %>%
str_extract('.+?(?= )')
title <-
batch %>%
html_nodes('a') %>%
html_attr('title')
url <-
batch %>%
html_nodes('a') %>%
html_attr('href') %>%
map(function(x) glue('https://zh.wikisource.org', x)) %>%
unlist()
DTMenu <-
rbindlist(
list(DTMenu,
data.table(type = type,
author = author,
title = title,
url = url)))
}
on.exit(rm(dlMenu, envir = .GlobalEnv))
return(DTMenu)
}
dlPoems <- function (menu) {
# Dowload the Poems and store them as a html file, if any of them are not downloaded yet
#
# Args:
# menu A data.table with url and title for downloading
# Returns:
# None
# Side-effects:
# 1. Download all poems under WORKING_DIR/poems
# 2. Remove itself
if (!dir.exists('poems')) {
dir.create('poems')
}
if (!file.exists('scrape.js')) {
stop("scrape.js doesn't exist!")
}
# Maximum number of acts in menu
apply(
menu,
MARGIN = 1,
function(ro) {
author <- eval(parse(text=glue("ro['author']")))
title <- eval(parse(text=glue("ro['title']")))
url <- eval(parse(text=glue("ro['url']")))
dest <- glue('./poems/{file}', file = glue('{title}-{author}.html'))
if (!file.exists(dest)) {
cat(sprintf('Downloading %s from %s ...\n', title, url))
execute <- glue('./phantomjs scrape.js "{url}" "{dest}"')
system(execute)
}
}
)
cat('All files are downloaded under "poems" dir\n')
on.exit(rm(dlPoems, envir = .GlobalEnv))
}