From fbfd07e149837608b20fa8087524c6c9555d0b53 Mon Sep 17 00:00:00 2001 From: Christopher Kittel Date: Mon, 16 May 2016 13:22:59 +0200 Subject: [PATCH 1/7] added CONTRIBUTING.md --- CONTRIBUTING.md | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..8824c75 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,58 @@ +# Contributing to getpapers + +Thank you for taking the time to contribute! :+1: + +This is a set of guidelines for contributing to quickscrape. You don't need to follow them as strict rules, use your best judgement and feel free to propose changes to this document as well via a pull request. + +#### Table of Contents + +[Basics](#basics) + +[How can I contribute?](#how-can-i-contribute) + +[Local testing](#local-testing) + +## Basics + +getpapers is based on Node.js. If you want an introduction on how to work on a project like this, you can find a comprehensive tutorial [here](http://www.nodebeginner.org/). + +## How can I contribute? + +### Report bugs + +If you encounter a bug, please let us know. You can raise a new issue [here](https://github.com/ContentMine/quickscrape/issues). Please include as much information in your report as possible, to help maintainers reproduce the problem. + +* A clear and descriptive title +* Describe the exact steps which reproduce the problem, e.g. the query you entered. +* Describe the behaviour following those steps, and where the problem occurred. +* Explain where it was different from what you expected to happen. +* Attach additional information to the report, such as error messages, or corrupted files. +* Add a `bug` label to the issue. + +Before submitting a bug, please check the [list of existing bugs](https://github.com/ContentMine/quickscrape/issues?q=is%3Aopen+is%3Aissue+label%3Abug) to see whether there is a similar issue open. 
You can then help by adding your information to an existing report. + +### Fixing bugs or implementing new features + +If you're not sure where to start, have a look at issues that have a `help wanted` label - here is a [list](https://github.com/ContentMine/quickscrape/issues?utf8=%E2%9C%93&q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22+). + +### Suggesting features or changes + +There is always room for improvement and we'd like to hear your perspective on it. + +Before creating a pull request, please raise an issue to discuss the proposed changes first. We can then make sure to make best use of your efforts. + +## Local testing + +In order to set up your development environment for getpapers, you need to install [Node.js](https://nodejs.org/en/). + +1. Create a fork on [github](https://help.github.com/articles/fork-a-repo/). + +1. Create a [new branch](https://www.atlassian.com/git/tutorials/using-branches/git-checkout) with a descriptive name. + +1. Work on your changes, and make regular commits to save them. + +1. Test your changes by running `npm install` within the repository and running quickscrape with `node bin/quickscrape.js`. + +1. When your changes work as intended, push them to your repository and [create a pull request](https://www.atlassian.com/git/tutorials/making-a-pull-request). + +1. We will then review the pull request and merge it as soon as possible. If problems arise, they will be discussed within the pull request. From 8198895596f848dd72bc370ec34fa1c449096c68 Mon Sep 17 00:00:00 2001 From: Christopher Kittel Date: Mon, 16 May 2016 13:30:41 +0200 Subject: [PATCH 2/7] added CONTRIBUTING.md --- CONTRIBUTING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8824c75..1d43b56 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# Contributing to getpapers +# Contributing to quickscrape Thank you for taking the time to contribute! 
:+1: @@ -14,7 +14,7 @@ This is a set of guidelines for contributing to quickscrape. You don't need to f ## Basics -getpapers is based on Node.js. If you want an introduction on how to work on a project like this, you can find a comprehensive tutorial [here](http://www.nodebeginner.org/). +quickscrape is based on Node.js. If you want an introduction on how to work on a project like this, you can find a comprehensive tutorial [here](http://www.nodebeginner.org/). ## How can I contribute? @@ -43,7 +43,7 @@ Before creating a pull request, please raise an issue to discuss the proposed ch ## Local testing -In order to set up your development environment for getpapers, you need to install [Node.js](https://nodejs.org/en/). +In order to set up your development environment for quickscrape, you need to install [Node.js](https://nodejs.org/en/). 1. Create a fork on [github](https://help.github.com/articles/fork-a-repo/). From e1d22b1b5ed5c157e8a95d22de987c839155355d Mon Sep 17 00:00:00 2001 From: larsgw Date: Sun, 19 Jun 2016 19:31:07 +0200 Subject: [PATCH 3/7] Added missing comma --- bin/quickscrape.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/quickscrape.js b/bin/quickscrape.js index bc80d22..8a36154 100755 --- a/bin/quickscrape.js +++ b/bin/quickscrape.js @@ -35,7 +35,7 @@ program 'use a number instead of the URL to name output subdirectories') .option('-i, --ratelimit ', 'maximum number of scrapes per minute (default 3)', 3) - .option('-h --headless', + .option('-h, --headless', 'render all pages in a headless browser') .option('-l, --loglevel ', 'amount of information to log ' + From 0be76de700c708336acefeefd56f0d6e4515938b Mon Sep 17 00:00:00 2001 From: Thomas Arrow Date: Mon, 20 Jun 2016 09:34:51 +0100 Subject: [PATCH 4/7] sanitize creation of folder --- bin/quickscrape.js | 5 +++-- package.json | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/quickscrape.js b/bin/quickscrape.js index bc80d22..1b913fc 100755 --- 
a/bin/quickscrape.js +++ b/bin/quickscrape.js @@ -11,7 +11,8 @@ var program = require('commander') , Scraper = thresher.Scraper , ep = require('../lib/eventparse.js') , loglevels = require('../lib/loglevels.js') - , outformat = require('../lib/outformat.js'); + , outformat = require('../lib/outformat.js') + , sanitize = require('sanitize-filename'); var pjson = require('../package.json'); @@ -223,7 +224,7 @@ var processUrl = function(url) { // url-specific output dir var dir = program.numberdirs ? ('' + i) : url.replace(/\/+/g, '_').replace(/:/g, ''); - dir = path.join(tld, dir); + dir = sanitize(path.join(tld, dir)); if (!fs.existsSync(dir)) { log.debug('creating output directory: ' + dir); fs.mkdirSync(dir); diff --git a/package.json b/package.json index e7626b3..6accc89 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,8 @@ "moment": "~2.10.2", "thresher": "^0.1.11", "which": "~1.0.5", - "winston": "~1.0.0" + "winston": "~1.0.0", + "sanitize-filename": "1.6.0" }, "bin": { "quickscrape": "bin/quickscrape.js" From b1348b99949198f7d170b370af1c44f9b5969eb6 Mon Sep 17 00:00:00 2001 From: petermr Date: Fri, 19 Aug 2016 11:37:31 +0100 Subject: [PATCH 5/7] Create BUGS.md --- BUGS.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 BUGS.md diff --git a/BUGS.md b/BUGS.md new file mode 100644 index 0000000..94fb682 --- /dev/null +++ b/BUGS.md @@ -0,0 +1,37 @@ +# bugs + +most bugs should be reported to the issues. However some are created by other packages such as Spooky and require +per-installation workarounds + +### quickscrape/tiny-jsonrpc bug + +The details will differ according to where `node` is installed. 
Here's PMR's: +``` +Error: Cannot find module '/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/spooky/lib/../node_modules/tiny-jsonrpc/lib/tiny-jsonrpc' so moving on to next url in list +Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/casperjs/bin/bootstrap.js. Domains, protocols and ports must match. +/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/eventemitter2/lib/eventemitter2.js:290 + throw arguments[1]; // Unhandled 'error' event + ^ + +Error: Child terminated with non-zero exit code 1 + at Spooky. (/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/spooky/lib/spooky.js:210:17) + at emitTwo (events.js:106:13) + at ChildProcess.emit (events.js:191:7) + at Process.ChildProcess._handle.onexit (internal/child_process.js:204:12) +``` +find where your quickscrape is: +``` +which quickscrape +gives: +/usr/local/n/versions/node/6.2.1/bin/quickscrape +create the top level dir +/usr/local/n/versions/node/6.2.1/ +other might have +/home/$USER/.nvm/versions/node/v6.3.1 + +``` +then copy files from the `lib` directory (after adjusting) +``` +cd /usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/ +cp -r node_modules/tiny-jsonrpc node_modules/spooky/node_modules +``` From b1544694ba856b3c37c477da871076ba04bbd1be Mon Sep 17 00:00:00 2001 From: Thomas Arrow Date: Fri, 26 Aug 2016 12:44:18 +0100 Subject: [PATCH 6/7] outformat to use -g to avoid conflict and enable file logging with correct winston syntax --- bin/quickscrape.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/quickscrape.js b/bin/quickscrape.js index 0d06e50..197bcbd 100755 --- a/bin/quickscrape.js +++ b/bin/quickscrape.js @@ -42,7 +42,7 @@ program 'amount of information to log ' + '(silent, verbose, info*, data, warn, error, or debug)', 'info') - .option('-f, --outformat ', + 
.option('-g, --outformat ', 'JSON format to transform results into (currently only bibjson)') .option('-f, --logfile ', 'save log to specified file in output directory as well as printing to terminal') @@ -86,7 +86,7 @@ tld = process.cwd(); if (program.hasOwnProperty('logfile')) { log.add(winston.transports.File, { - filename: program.logfile, + stream: fs.createWriteStream(program.logfile.toString()), level: 'debug' }); log.info('Saving logs to ./' + program.output + '/' + program.logfile); From 4a59a6c068073b8f8b728d00d7f65aab164249dc Mon Sep 17 00:00:00 2001 From: Thomas Arrow Date: Wed, 31 Aug 2016 13:12:24 +0100 Subject: [PATCH 7/7] add logfile to variable before we change directory to first scrape --- bin/quickscrape.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/quickscrape.js b/bin/quickscrape.js index 197bcbd..3536886 100755 --- a/bin/quickscrape.js +++ b/bin/quickscrape.js @@ -85,8 +85,9 @@ process.chdir(program.output); tld = process.cwd(); if (program.hasOwnProperty('logfile')) { + var logfilestream = fs.createWriteStream(program.logfile.toString()) log.add(winston.transports.File, { - stream: fs.createWriteStream(program.logfile.toString()), + stream: logfilestream, level: 'debug' }); log.info('Saving logs to ./' + program.output + '/' + program.logfile);