|
1 | | -import path from 'node:path' |
2 | 1 | import { createCrawl, createCrawlOpenAI } from 'x-crawl' |
3 | | -import { fileURLToPath } from 'node:url' |
4 | 2 |
|
5 | 3 | import { BASE_URL, API_KEY } from './envConfig' |
| 4 | +import { fileURLToPath } from 'url' |
6 | 5 |
|
7 | | -const pathResolve = (dirPath: string) => |
8 | | - fileURLToPath(new URL(dirPath, import.meta.url)) |
| 6 | +const pathResolve = (dir: string) => |
| 7 | + fileURLToPath(new URL(dir, import.meta.url)) |
9 | 8 |
|
10 | | -const crawlOpenAIApp = createCrawlOpenAI({ |
11 | | - clientOptions: { baseURL: BASE_URL, apiKey: API_KEY } |
| 9 | +const crawlApp = createCrawl({ |
| 10 | + maxRetry: 3, |
| 11 | + intervalTime: { max: 2000, min: 1000 } |
12 | 12 | }) |
13 | 13 |
|
14 | | -const crawlApp = createCrawl({ |
15 | | - crawlPage: { puppeteerLaunchOptions: { headless: true } } |
| 14 | +const crawlOpenAIApp = createCrawlOpenAI({ |
| 15 | + clientOptions: { baseURL: BASE_URL, apiKey: API_KEY }, |
| 16 | + defaultModel: { chatModel: 'gpt-4-turbo-preview' } |
16 | 17 | }) |
17 | 18 |
|
| 19 | +// crawlPage 用于爬取页面 |
18 | 20 | crawlApp.crawlPage('https://www.airbnb.cn/s/select_homes').then(async (res) => { |
19 | 21 | const { page, browser } = res.data |
20 | 22 |
|
21 | | - // await page.waitForSelector('.g1nr81q6') |
22 | | - // const sectionHTML = await page.$eval('.g1nr81q6 ', (el) => el.innerHTML) |
23 | | - await page.waitForSelector( |
24 | | - '.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2)' |
25 | | - ) |
26 | | - const sectionHTML = await page.$$eval( |
27 | | - '.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2) ', |
28 | | - (els) => els.reduce((p, v) => p + v.innerHTML, '') |
29 | | - ) |
| 23 | + // 等待元素出现在页面中, 并获取 HTML |
| 24 | + const targetSelector = '[data-tracking-id="TOP_REVIEWED_LISTINGS"]' |
| 25 | + await page.waitForSelector(targetSelector) |
| 26 | + const highlyHTML = await page.$eval(targetSelector, (el) => el.innerHTML) |
30 | 27 |
|
31 | | - const srcResult = await crawlOpenAIApp.parseElements<{ src: string }>( |
32 | | - sectionHTML, |
33 | | - `获取 img 的 src` |
| 28 | + // 让 AI 获取图片链接, 并去重 |
| 29 | + const srcResult = await crawlOpenAIApp.parseElements( |
| 30 | + highlyHTML, |
| 31 | + '获取图片链接, 不要source里面的, 并去重' |
34 | 32 | ) |
35 | 33 |
|
36 | | - console.log(srcResult) |
| 34 | + browser.close() |
37 | 35 |
|
| 36 | + // crawlFile 用于爬取文件资源 |
38 | 37 | crawlApp.crawlFile({ |
39 | 38 | targets: srcResult.elements.map((item) => item.src), |
40 | | - storeDirs: pathResolve('upload') |
| 39 | + storeDirs: pathResolve('./upload') |
41 | 40 | }) |
42 | | - |
43 | | - browser.close() |
44 | 41 | }) |
0 commit comments