Skip to content

Commit 5fc6f33

Browse files
committed
version 2.0.0 features added
1 parent 9e5d766 commit 5fc6f33

File tree

6 files changed

+65
-28
lines changed

6 files changed

+65
-28
lines changed

UPDATE.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# V 2.0.0
2+
3+
- ## added dynamicSchemas config option
4+
web-crawljs now has dynamicSchemas that allow you to change the structure of nextSelector, nextSelectBy,
5+
fetchSelector and fetchSelectBy if the url matches a url you specify.
6+
7+
- ## changed loop and loopFn to depth and depthFn
8+
The loop and loopFn have been renamed to depth and depthFn.
9+
10+
- ## fixed issue with finalFn
11+
The issue with finalFn in the former version is now fixed.
12+
13+
- ## added formatUrl config option
14+
The function formats the next url to crawl. It takes a url and returns a new url. This url
15+
can be a string or an Object that is supported by the request module for making requests.
16+

module/crawl.js

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@ function createCrawler(config = {}) {
88

99
let urls, finalFn, depthFn, depth;
1010

11+
//
1112
function defaultLoopFn(data) {
12-
console.log("end of each loop")
13+
console.log("---depth---")
1314
}
1415

1516
function defaultFinalFn() {
16-
console.log('end...')
17+
console.log('---final---')
1718
}
1819

1920

@@ -28,26 +29,30 @@ function createCrawler(config = {}) {
2829
nextLinks = nextLinks.concat(urls);
2930
})(config);
3031

32+
/**
33+
* @description
34+
*/
3135
function crawl() {
32-
try {
36+
3337
crawlUrls(nextLinks, config)
3438
.then(scrapedData => {
3539
depthFn(scrapedData.fetchedData);
3640
gen.next(scrapedData.nextLinks);
3741
})
3842
.catch(err => {
39-
console.error(err)
40-
})
41-
}catch (err){
42-
console.log(err)
43-
}
43+
gen.next({err})
44+
});
45+
4446
}
4547

4648

4749
function* crawlGen() {
4850
for (let i = 0; i < depth; i++) {
4951
nextLinks = yield crawl();
50-
// console.log(nextLinks)
52+
if(nextLinks.err){
53+
console.error(nextLinks.err);
54+
break;
55+
}
5156
}
5257
finalFn()
5358
}

module/crawlUrls.js

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,18 @@ function crawlUrl(urls, resolve) {
5252
function req(url) {
5353

5454
request(url, function (err, response, body) {
55-
if (err) return console.log(err.message);
5655
visitedUrls--;
57-
getDomContents = dom(body).getDomContents; //
58-
scrapedData.push(fetchFromPage(url));
59-
let newLink = _.uniq(util.sortDataToArray([selectNextCrawlContent(url)])).map(url => {
60-
"use strict";
61-
return formatUrl(url)
62-
});
63-
initialLink = initialLink.concat(newLink);
56+
if (err) {
57+
console.error(err.message);
58+
} else {
59+
getDomContents = dom(body).getDomContents; //
60+
scrapedData.push(fetchFromPage(url));
61+
let newLink = _.uniq(util.sortDataToArray([selectNextCrawlContent(url)])).map(url => {
62+
"use strict";
63+
return formatUrl(url)
64+
});
65+
initialLink = initialLink.concat(newLink);
66+
}
6467

6568
if (visitedUrls == 0) {
6669
resolve({fetchedData: scrapedData, nextLinks: initialLink})

module/dom.js

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,14 @@ module.exports = function (content) {
88
//errors
99
const KEY_ERROR = "the keys don't match. Make sure keys in arg1 matches keys in arg2";
1010

11-
11+
/**
12+
* @description returns back the Dom content from the page's body
13+
* @param selector
14+
* @param selectBy
15+
* @param callback
16+
* @param url
17+
* @return {*}
18+
*/
1219
function getDomContents(selector, selectBy, callback, url) {
1320

1421
let $ = cheerio.load(content);
@@ -57,7 +64,7 @@ module.exports = function (content) {
5764

5865

5966
/**
60-
*
67+
* @description extracts the data by the name given to the selector Object keys.
6168
* @param selector
6269
* @param selectBy
6370
* @param $

module/util.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ function sortDataToArray(data) {
6262
}, [])
6363
}
6464

65-
//Todo: add what will reformat the urls, the selectBy e.t.c.
6665
/**
6766
* @description returns a url; the url can either be a string or an Object supported by request package
6867
* @param {String} url

test/crawl_test.js

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
let expect = require('chai').expect;
22
let Crawler = require('../index')({
3-
fetchSelector: {title: "title", body: "div#mw-content-text"},
4-
fetchSelectBy: {title: 'text', body: "text"},
3+
fetchSelector: {title: "title"},
4+
fetchSelectBy: {title: 'text'},
55
nextSelector: {links: 'a[href^="/"]'},
66
nextSelectBy: {links: ['attr', 'href']},
77
fetchFn: (err, data, url) => {
88
if (err) console.error(err.message);
9-
if (url == 'http://localhost/dashboard/') console.log('saving somewhere', data);
10-
// console.log(data.title[0],url)
11-
// require('fs').writeFile('./data.txt',data.body[0],(err,data)=>console.log('data written to data.txt'))
9+
if (/https:\/\/en.wiki/.test(url)) {
10+
let json = JSON.stringify(data);
11+
require('fs').writeFile('./data.json', json, (err, data) => console.log('data written to data.txt'))
12+
} else {
13+
return console.log(data);
14+
}
15+
1216
},
1317
formatUrl: function (url) {
1418
if (url == 'http://localhost/dashboard/faq.html') {
@@ -21,14 +25,17 @@ let Crawler = require('../index')({
2125
// console.log(data,url)
2226
},
2327
dynamicSchemas: {
24-
fetchSelector: [{url: /http:\/\/localhost\//, schema: {title: "title"}}],
25-
fetchSelectBy: [{url: /http:\/\/localhost\//, schema: {title: "text"}}],
28+
//when the url matches https://en.wikipedia.org it uses this schema to format it
29+
fetchSelector: [{url: /https:\/\/en.wikipedia.org/, schema: {title: "title", body: "div#mw-content-text"}}],
30+
fetchSelectBy: [{url: /https:\/\/en.wikipedia.org/, schema: {title: "text",body: "text"}}],
31+
nextSelector: [{url: /https:\/\/en.wikipedia/, schema: {link: ""}}],
32+
nextSelectBy: [{url: /https:\/\/en.wikipedia/, schema: {link: ""}}],
2633
},
2734
depthFn: function (data) {
2835
// console.log(data)
2936
},
3037
depth: 2,
31-
urls: ['http://localhost/dashboard/']
38+
urls: ['http://localhost/dashboard/' /*'https://en.wikipedia.org/wiki/Web_crawler'*/]
3239
});
3340

3441

0 commit comments

Comments
 (0)