From e7c2be520964fd93ac94534f6eb979e178fef38e Mon Sep 17 00:00:00 2001 From: ayushjainrksh Date: Sat, 30 Oct 2021 00:45:26 +0530 Subject: [PATCH] Refactor bits and pieces --- src/constants/index.js | 7 ++++++ src/constants/selectors.js | 22 +++++++++++++++++ src/constants/urls.js | 17 +++++++++++++ src/scripts/scrapeLinkedin.js | 40 +++++++++++++------------------ src/utils/formatters.js | 7 ++++++ src/utils/index.js | 9 +++++++ src/utils/{scoll.js => scroll.js} | 0 7 files changed, 78 insertions(+), 24 deletions(-) create mode 100644 src/constants/urls.js create mode 100644 src/utils/formatters.js rename src/utils/{scoll.js => scroll.js} (100%) diff --git a/src/constants/index.js b/src/constants/index.js index e69de29..4bdcf89 100644 --- a/src/constants/index.js +++ b/src/constants/index.js @@ -0,0 +1,7 @@ +const selectors = require("./selectors"); +const urls = require("./urls"); + +module.exports = { + selectors, + urls, +}; diff --git a/src/constants/selectors.js b/src/constants/selectors.js index e69de29..5cf6d4c 100644 --- a/src/constants/selectors.js +++ b/src/constants/selectors.js @@ -0,0 +1,22 @@ +const VISIT_ALL_EMPLOYEES = + "a.ember-view.org-top-card-secondary-content__see-all-link"; + +const PROFILE_LINK_SELECTORS = [ + ".search-result__info .search-result__result-link", + ".reusable-search__entity-results-list .entity-result__title-text a", +]; + +const BOTTOM_NAV_NEXT_BUTTON = + ".artdeco-pagination__button.artdeco-pagination__button--next"; + +const TIME_SELECTOR = + "div.feed-shared-actor__meta.relative >" + + " span.feed-shared-actor__sub-description.t-12.t-black--light.t-normal" + + " > span > span.visually-hidden"; + +module.exports = { + VISIT_ALL_EMPLOYEES, + PROFILE_LINK_SELECTORS, + BOTTOM_NAV_NEXT_BUTTON, + TIME_SELECTOR, +}; diff --git a/src/constants/urls.js b/src/constants/urls.js new file mode 100644 index 0000000..03d5dcc --- /dev/null +++ b/src/constants/urls.js @@ -0,0 +1,17 @@ +// Constants +const LINKEDIN_URL = "https://www.linkedin.com"; + +// Functions +const companyProfileUrl = (companyName) => { + return `${LINKEDIN_URL}/company/${companyName}`; +}; + +const userProfileUrl = (profileLink) => { + return `${profileLink}/detail/recent-activity`; +}; + +module.exports = { + LINKEDIN_URL, + companyProfileUrl, + userProfileUrl, +}; diff --git a/src/scripts/scrapeLinkedin.js b/src/scripts/scrapeLinkedin.js index 2881cf5..04cb0a7 100644 --- a/src/scripts/scrapeLinkedin.js +++ b/src/scripts/scrapeLinkedin.js @@ -2,9 +2,9 @@ const puppeteer = require("puppeteer"); const fs = require("fs"); const rxjs = require("rxjs"); const { mergeMap, toArray, filter } = require("rxjs/operators"); -const { saveProfiles } = require("../utils/fileIO"); const { linkedinLogin } = require("./login"); -const { autoScroll } = require("../utils/scoll"); +const { urls, selectors } = require("../constants"); +const { fileIO, formatters, scroll } = require("../utils"); /** * Fetch all profile links @@ -14,16 +14,13 @@ const { autoScroll } = require("../utils/scoll"); const fetchProfileLinks = async (page, pagesToVisit = 2) => { let profileLinks = []; for (let pageNumber = 0; pageNumber < pagesToVisit; pageNumber++) { - await autoScroll(page); + await scroll.autoScroll(page); //Fetch all profile links from the page profileLinks.push( ...(await page.evaluate(() => { //Multiple selectors for different displays of LinkedIn(see issue #20) - const profileListSelectors = [ - ".search-result__info .search-result__result-link", - ".reusable-search__entity-results-list .entity-result__title-text a", - ]; + const profileListSelectors = selectors.PROFILE_LINK_SELECTORS; let profileListNodes = null; for ( let profileListSelectorIndex = 0; @@ -48,7 +45,7 @@ const fetchProfileLinks = async (page, pagesToVisit = 2) => { profileListNodes.forEach((profile) => { if (profile.href) { // Remove query params from URL - profiles.push(profile.href.split("?")[0]); + profiles.push(formatters.removeQueryParamsFromUrl(profile.href)); } }); return profiles; @@ -58,9 +55,7 @@ const fetchProfileLinks = async (page, pagesToVisit = 2) => { if (pageNumber < pagesToVisit - 1) { //Click on next button on the bottom of the profiles page - await page.click( - ".artdeco-pagination__button.artdeco-pagination__button--next" - ); + await page.click(selectors.BOTTOM_NAV_NEXT_BUTTON); await page.waitForNavigation(); } } @@ -84,19 +79,15 @@ const fetchEachProfileActivityInParallel = async ( return rxjs.from(profileLinks).pipe( mergeMap(async (profileLink) => { //Visit activity page - await page.goto(profileLink + "/detail/recent-activity", { + await page.goto(urls.userProfileUrl(profileLink), { waitUntil: waitUntilOptions, }); //Find time of last activities of a user(likes, comments, posts) const individualActivities = await page.evaluate(() => { let timeOfActivity = []; - const timeSelector = - "div.feed-shared-actor__meta.relative >" + - " span.feed-shared-actor__sub-description.t-12.t-black--light.t-normal" + - " > span > span.visually-hidden"; - if (document.querySelectorAll(timeSelector)) { - document.querySelectorAll(timeSelector).forEach((item) => { + if (document.querySelectorAll(selectors.TIME_SELECTOR)) { + document.querySelectorAll(selectors.TIME_SELECTOR).forEach((item) => { if (item.innerHTML) { //Log all user activity within a week if ( @@ -156,7 +147,7 @@ const scrapeLinkedIn = async (data) => { } } else { //Visit LinkedIn - await page.goto(`https://www.linkedin.com/`); + await page.goto(urls.LINKEDIN_URL); //Login to your account await linkedinLogin(data.username, data.password, page); @@ -164,14 +155,13 @@ const scrapeLinkedIn = async (data) => { try { //Visit the company's page and find the list of employees - await page.goto(`https://www.linkedin.com/company/${data.company}`, { + console.log(urls); + await page.goto(urls.companyProfileUrl(data.company), { waitUntil: waitUntilOptions, }); //Visit all employees from the company's page - await page.click( - "a.ember-view.org-top-card-secondary-content__see-all-link" - ); + await page.click(selectors.VISIT_ALL_EMPLOYEES); } catch (e) { console.error( "Oops! An error occured while trying to find the company's page." + @@ -198,7 +188,7 @@ const scrapeLinkedIn = async (data) => { console.log("Active users : ", activeEmployees); //Save profiles to a file - saveProfiles(activeEmployees); + fileIO.saveProfiles(activeEmployees); await browser.close(); } catch (err) { @@ -209,5 +199,7 @@ const scrapeLinkedIn = async (data) => { }; module.exports = { + fetchProfileLinks, + fetchEachProfileActivityInParallel, scrapeLinkedIn, }; diff --git a/src/utils/formatters.js b/src/utils/formatters.js new file mode 100644 index 0000000..2e80c98 --- /dev/null +++ b/src/utils/formatters.js @@ -0,0 +1,7 @@ +const removeQueryParamsFromUrl = (url) => { + return url.split("?")[0]; +}; + +module.exports = { + removeQueryParamsFromUrl, +}; diff --git a/src/utils/index.js b/src/utils/index.js index e69de29..30f4a4d 100644 --- a/src/utils/index.js +++ b/src/utils/index.js @@ -0,0 +1,9 @@ +const fileIO = require("./fileIO"); +const formatters = require("./formatters"); +const scroll = require("./scroll"); + +module.exports = { + fileIO, + formatters, + scroll, +}; diff --git a/src/utils/scoll.js b/src/utils/scroll.js similarity index 100% rename from src/utils/scoll.js rename to src/utils/scroll.js