-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
local-crawler.js
263 lines (241 loc) · 9.26 KB
/
local-crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
const { Builder } = require("selenium-webdriver");
const firefox = require("selenium-webdriver/firefox");
const fs = require("fs");
const { parse } = require("csv-parse");
const axios = require("axios");
// Timestamp taken when the script starts; the per-site log line reports the
// total elapsed time relative to it.
const total_begin = Date.now();

// Error bookkeeping: maps an error label to the list of site URLs that
// produced it (filled in by visit_site).
const err_obj = {};

// Sites to crawl: the first column of every row of sites1.csv, reading from
// line 2 of the file onwards. The stream is consumed asynchronously, so the
// array fills up after this statement has returned.
const sites = [];
fs.createReadStream("sites1.csv")
  .pipe(parse({ delimiter: ",", from_line: 2 }))
  .on("data", (row) => {
    sites.push(row[0]);
  })
  .on("error", (error) => {
    console.log(error.message);
  });

// Firefox options and the active WebDriver session; both are assigned by
// setup() and replaced whenever the driver is restarted.
let options;
let driver;
// Custom error type.
// It is thrown when the title of a site shows a human check (or a similar
// block page) instead of the real content, so that the sites we can't crawl
// with the vpn on can be told apart from other failures by the error name.
class HumanCheckError extends Error {
  // message: text describing the failure; forwarded unchanged to Error
  constructor(message) {
    super(message);
    // overrides the default "Error" so the name alone identifies this case
    this.name = "HumanCheckError";
  }
}
// Starts a new Firefox WebDriver session and stores it in the module-level
// `driver` (the options used are stored in the module-level `options`).
// Waits 3 s before building the options and another 3 s after the timeouts
// have been configured. Takes no arguments and resolves to undefined.
async function setup() {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // command-line arguments handed to the browser, added in this order
  const browser_args = [
    "--headful",
    "disable-infobars",
    '--no-sandbox',
    '--disable-application-cache',
    '--disable-gpu',
    "--disable-dev-shm-usage",
  ];

  await pause(3000);

  // the second setBinary call is applied after the Nightly channel one;
  // signature enforcement is switched off before ./myextension.xpi is added
  // (presumably because the extension is not signed -- confirm)
  options = new firefox.Options()
    .setBinary(firefox.Channel.NIGHTLY)
    .setBinary('/Applications/Firefox\ Nightly.app/Contents/MacOS/firefox')
    .setPreference("xpinstall.signatures.required", false)
    .addExtensions("./myextension.xpi");
  for (const arg of browser_args) {
    options.addArguments(arg);
  }

  driver = new Builder()
    .forBrowser("firefox")
    .setFirefoxOptions(options)
    .build();

  // a page that has not loaded after 35 s times out; scripts get 30 s
  const timeouts = { implicit: 0, pageLoad: 35000, script: 30000 };
  await driver.manage().setTimeouts(timeouts);
  console.log("built");

  await pause(3000);
  console.log("setup complete");
}
// Sends `data` as the body of a PUT request to the local analysis endpoint.
// A failed request is logged with console.error and not rethrown, so the
// returned promise always resolves (to undefined).
//   data: the record to send
async function put_site_id(data) {
  const endpoint = `http://localhost:8080/analysis`;
  try {
    await axios.put(endpoint, data);
  } catch (request_error) {
    console.error(request_error);
  }
}
// Checks whether the analysis of `site` reached the local REST API and, if
// so, stamps the newest matching record with `site_id`.
//
//   site:    URL that was visited, e.g. "https://www.example.com/some/path"
//   site_id: id to write into the record's "site_id" field
//
// Resolves to true when a record was updated through put_site_id, false
// otherwise. A failing GET request is logged and also yields false.
async function check_update_DB(site, site_id) {
  // keep only the domain part of the url: drop the scheme (http or https) and
  // a leading "www.", then everything from the first "/" on
  const site_str = site.replace(/^https?:\/\/(www\.)?/, "").split("/")[0];

  let added = false;
  try {
    // after a site is visited, see if the data was added to the db
    const response = await axios.get(
      `http://localhost:8080/analysis/${site_str}`
    );
    const site_records = response.data;
    if (site_records.length >= 1) {
      // it exists -> take the last added record and make sure its id is
      // still null/undefined (ie it just got added)
      const latest = site_records[site_records.length - 1];
      if (latest["site_id"] == null) {
        latest["site_id"] = site_id;
        await put_site_id(latest);
        added = true;
      }
    } else {
      // nothing stored under this domain: fall back to the records the api
      // returns from /null_analysis and stamp the last one of those
      const res = await axios.get(`http://localhost:8080/null_analysis`);
      const null_records = res.data;
      console.log("null site_id: ", null_records);
      if (null_records.length >= 1) {
        const latest = null_records[null_records.length - 1];
        latest["site_id"] = site_id;
        await put_site_id(latest);
        added = true;
      }
    }
  } catch (error) {
    // the rest-api probably exited (may be different if the api is not
    // local); report the site as not added
    console.error(error.message);
  }
  return added;
}
// True when a page title looks like a block / human-check / error page rather
// than the site itself. Every entry lists the patterns that must ALL match
// the title for that entry to count; one matching entry is enough.
function title_signals_block(title) {
  const signals = [
    [/Access/i, /Denied/i],
    [/error/i],
    [/service/i, /unavailable/i],
    [/Just a moment.../i],
    [/you have been blocked/i],
    [/site not available/i],
    [/attention required/i],
    [/access to this page has been blocked/i],
    [/site/i, /temporarily unavailable/i],
    [/site/i, /temporarily down/i],
    [/403 forbidden/i],
    [/pardon our interruption/i],
    [/robot or human/i],
    [/are you a robot/i],
    [/block -/i],
    [/Human Verification/i],
  ];
  return signals.some((patterns) =>
    patterns.every((pattern) => title.match(pattern))
  );
}

// Path of the screenshot file for a site url: the url without scheme and
// leading "www.", with characters that are not safe in a file name (such as
// the "/" of a path) replaced by "_" so the file lands directly inside
// ./error-logging/.
function screenshot_path_for(site) {
  const st = site.replace(/^https?:\/\/(www\.)?/, "");
  return './error-logging/' + st.replace(/[\/\\?%*:|"<>]/g, "_") + ".png";
}

// Loads sites[site_id] in the browser, waits for the page to settle and
// checks the title for signs of a block / human-check page.
//
//   sites:           array of site urls
//   site_id:         index of the site to visit
//   page_wait_ms:    time to stay on the page after loading (default 22000;
//                    a longer wait, e.g. 80000, suits ground truth collection)
//   restart_wait_ms: pause before the driver is restarted after a driver
//                    error (default 10000)
//
// Resolves to "no_error", or to the name of the error that occurred. Every
// error is recorded in err_obj and err_obj is written to
// ./error-logging/error-logging.json. For any error other than a
// HumanCheckError the driver is restarted through setup().
async function visit_site(sites, site_id, page_wait_ms = 22000, restart_wait_ms = 10000) {
  let error_value = "no_error";
  console.log(site_id, ": ", sites[site_id]);
  try {
    await driver.get(sites[site_id]);
    await new Promise((resolve) => setTimeout(resolve, page_wait_ms));
    // check if access is denied
    // if so, throw an error so it gets tagged as a human check site
    const title = await driver.getTitle();
    if (title_signals_block(title)) {
      throw new HumanCheckError("Human Check");
    }
  } catch (e) {
    console.log(e);
    // we want to separate reaching an error page from other webdriver errors
    const msg = e.message.match(/reached error page/i)
      ? ": Reached Error Page"
      : "";
    // log the errors in an object so you don't have to sort through manually
    const label = e.name + msg;
    if (label in err_obj) {
      err_obj[label].push(sites[site_id]);
    } else {
      err_obj[label] = [sites[site_id]];
    }
    console.log(err_obj);
    error_value = e.name; // update error value

    // write the error log as a JSON string to a file
    const err_data = JSON.stringify(err_obj);
    fs.writeFile("./error-logging/error-logging.json", err_data, (error) => {
      // a writing problem is logged and then thrown from this callback
      if (error) {
        console.error(error);
        throw error;
      }
      console.log("error-logging.json written correctly");
    });

    // if it's just a human check site, we don't need to restart
    if (e.name != "HumanCheckError") {
      if (e.message.match(/Failed to decode response from marionette/i)) {
        console.log(e.name + ': ' + e.message + "-- driver should already have quit ");
      } else {
        // take a screenshot of the page so we can better understand what was
        // going on; awaited so that a failing screenshot is caught here and
        // the browser is not closed while the screenshot is still pending
        try {
          const data = await driver.takeScreenshot();
          const base64Data = data.replace(/^data:image\/png;base64,/, "");
          const filename = screenshot_path_for(sites[site_id]);
          fs.writeFile(filename, base64Data, 'base64', (err) => {
            if (err) console.log(err);
          });
        } catch (screenshot_err) {
          console.log('screenshot failed');
        }
        // close the old browser before a new one is started; a failure to
        // quit must not stop the restart
        try {
          await driver.quit();
        } catch (quit_err) {
          console.log('driver quit failed: ' + quit_err.message);
        }
      }
      console.log("------restarting driver------");
      await new Promise((resolve) => setTimeout(resolve, restart_wait_ms));
      await setup(); //restart the selenium driver
    }
  }
  return error_value;
}
// Records the outcome of a visited site in the db and visits the site a
// second time when that is worthwhile.
//
//   sites:       array of site urls
//   site_id:     index of the site that was just visited
//   error_value: value returned by visit_site for that visit
//
// The site is visited again only if the db check did not add it and the
// error was not one of those that prevent us from analyzing the site.
// Resolves to undefined.
async function putReq_and_checkRedo(sites, site_id, error_value) {
  // errors for which a second visit would not help
  const blocking_errors = [
    "InsecureCertificateError",
    "WebDriverError",
    "HumanCheckError",
  ];

  // check the db to see if the site was added / add the site_id
  const added = await check_update_DB(sites[site_id], site_id);
  if (added != false || blocking_errors.includes(error_value)) {
    return;
  }

  console.log("redo prev site");
  await visit_site(sites, site_id);
  await new Promise((resolve) => setTimeout(resolve, 2000));
  // putting site id on redo site
  await check_update_DB(sites[site_id], site_id);
}
// Entry point: starts the browser, visits every site in `sites` in order and
// closes the browser at the end.
//
// The db bookkeeping for a site is done at the start of the NEXT iteration
// (after that iteration's 2 s pause), except for the last site, which is
// handled right after its own visit. Site ids are the numeric array indices,
// so every site_id sent to the db has the same type.
(async () => {
  await setup();
  try {
    let error_value = "no_error";
    for (let site_id = 0; site_id < sites.length; site_id++) {
      const begin_site = Date.now(); // for timing
      await new Promise((resolve) => setTimeout(resolve, 2000));
      if (site_id > 0) {
        // check if previous site was added
        // if so, do the put request accordingly
        // if not, see if we need to redo it
        await putReq_and_checkRedo(sites, site_id - 1, error_value);
      }
      error_value = await visit_site(sites, site_id);
      // just for the last entry--inc timeout to make sure it is input before checking
      if (site_id === sites.length - 1) {
        // give it extra time for site to be added to db
        await new Promise((resolve) => setTimeout(resolve, 2000));
        await putReq_and_checkRedo(sites, site_id, error_value);
      }
      const end_site = Date.now();
      const timeSpent_site = (end_site - begin_site) / 1000;
      console.log(
        "time spent: ",
        timeSpent_site,
        "total elapsed: ",
        (end_site - total_begin) / 1000
      );
    }
  } finally {
    // close the browser even when the crawl stops with an error
    await driver.quit();
  }
})().catch((error) => {
  // report a failed crawl instead of leaving an unhandled rejection
  console.error(error);
  process.exitCode = 1;
});