diff --git a/README.md b/README.md index da49910..6b1330d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen * **Knowledge Base Articles:** Converts help center articles from HTML to clean Markdown. * **Incremental Updates:** Only processes tickets/articles updated since the last run. * **Flexible Filtering:** Filter tickets by status and priority. +* **Notion Integration:** Fetches entries from a Notion database, converting them to searchable chunks. + * **Flexible Filtering:** filter returned entries from the database using specified criteria. * **Local Directory Processing:** Scans local directories for files, converts content to searchable chunks. * **PDF Support:** Automatically extracts text from PDF files and converts them to Markdown format using Mozilla's PDF.js. * **Content Extraction:** Uses Puppeteer for rendering JavaScript-heavy pages and `@mozilla/readability` to extract the main article content. @@ -114,6 +116,12 @@ Configuration is managed through two files: * `start_date`: (Optional) Only process tickets/articles updated since this date (e.g., `'2025-01-01'`). * `ticket_status`: (Optional) Filter tickets by status (defaults to `['new', 'open', 'pending', 'hold', 'solved']`). * `ticket_priority`: (Optional) Filter tickets by priority (defaults to all priorities). + + For Notion (`type: 'notion'`): + * `api_token`: Your Notion API token (reference environment variable + as `'${NOTION_API_TOKEN}'`). + * `database_id`: The ID of the Notion database to query + * `filter`: Filter to apply when querying the database Common configuration for all types: * `product_name`: A string identifying the product (used in metadata). 
@@ -186,6 +194,21 @@ Configuration is managed through two files: type: 'sqlite' params: db_path: './zendesk-kb.db' + + # Notion example + - type: notion + product_name: 'notion-database' + version: 'latest' + database_id: '897e5a76ae524b489fdfe71f5945d1af' + api_token: '${NOTION_API_TOKEN}' + filter: + property: 'Status' + status: + equals: 'Current' + database_config: + type: 'sqlite' + params: + db_path: './notion-database.db' # Qdrant example - type: 'website' diff --git a/doc2vec.ts b/doc2vec.ts index 4a9d8bc..20bb01a 100644 --- a/doc2vec.ts +++ b/doc2vec.ts @@ -1,5 +1,7 @@ #!/usr/bin/env node +require('dotenv').config() + import axios from 'axios'; import crypto from 'crypto'; import * as yaml from 'js-yaml'; @@ -7,25 +9,27 @@ import * as fs from 'fs'; import * as path from 'path'; import { Buffer } from 'buffer'; import { OpenAI } from "openai"; -import * as dotenv from "dotenv"; import { Logger, LogLevel } from './logger'; import { Utils } from './utils'; import { DatabaseManager } from './database'; import { ContentProcessor } from './content-processor'; + +import { isFullPage, Client as NotionClient } from '@notionhq/client'; +import { NotionConverter } from 'notion-to-md'; + import { Config, - SourceConfig, GithubSourceConfig, WebsiteSourceConfig, LocalDirectorySourceConfig, ZendeskSourceConfig, + NotionSourceConfig, DatabaseConnection, DocumentChunk } from './types'; const GITHUB_TOKEN = process.env.GITHUB_PERSONAL_ACCESS_TOKEN; -dotenv.config(); class Doc2Vec { private config: Config; @@ -35,7 +39,6 @@ class Doc2Vec { constructor(configPath: string) { this.logger = new Logger('Doc2Vec', { - level: LogLevel.DEBUG, useTimestamp: true, useColor: true, prettyPrint: true @@ -92,6 +95,8 @@ class Doc2Vec { await this.processLocalDirectory(sourceConfig, sourceLogger); } else if (sourceConfig.type === 'zendesk') { await this.processZendesk(sourceConfig, sourceLogger); + } else if (sourceConfig.type === 'notion') { + await this.processNotionDatabase(sourceConfig, 
sourceLogger); } else { sourceLogger.error(`Unknown source type: ${(sourceConfig as any).type}`); } @@ -210,61 +215,7 @@ class Doc2Vec { logger.info(`Issue #${issueNumber}: Created ${chunks.length} chunks`); // Process and store each chunk immediately - for (const chunk of chunks) { - const chunkHash = Utils.generateHash(chunk.content); - const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; - - if (dbConnection.type === 'sqlite') { - const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); - const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; - - if (existing && existing.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embeddings[0], logger, chunkHash); - logger.debug(`Stored chunk ${chunkId} in SQLite`); - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - } - } else if (dbConnection.type === 'qdrant') { - try { - let pointId: string; - try { - pointId = chunk.metadata.chunk_id; - if (!Utils.isValidUuid(pointId)) { - pointId = Utils.hashToUuid(chunk.metadata.chunk_id); - } - } catch (e) { - pointId = crypto.randomUUID(); - } - - const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { - ids: [pointId], - with_payload: true, - with_vector: false, - }); - - if (existingPoints.length > 0 && existingPoints[0].payload && existingPoints[0].payload.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embeddings[0], chunkHash); - logger.debug(`Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`); - } else { - logger.error(`Embedding 
failed for chunk: ${chunkId}`); - } - } catch (error) { - logger.error(`Error processing chunk in Qdrant:`, error); - } - } - } + await this.processAndStoreChunks(issueNumber, chunks, dbConnection, logger); }; logger.info(`Fetching GitHub issues for ${repo} since ${lastRunDate}`); @@ -320,74 +271,10 @@ class Doc2Vec { logger.info(`Created ${chunks.length} chunks`); if (chunks.length > 0) { - const chunkProgress = logger.progress(`Embedding chunks for ${url}`, chunks.length); - - for (let i = 0; i < chunks.length; i++) { - const chunk = chunks[i]; - validChunkIds.add(chunk.metadata.chunk_id); - - const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; - - let needsEmbedding = true; - const chunkHash = Utils.generateHash(chunk.content); - - if (dbConnection.type === 'sqlite') { - const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); - const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; - - if (existing && existing.hash === chunkHash) { - needsEmbedding = false; - chunkProgress.update(1, `Skipping unchanged chunk ${chunkId}`); - logger.info(`Skipping unchanged chunk: ${chunkId}`); - } - } else if (dbConnection.type === 'qdrant') { - try { - let pointId: string; - try { - pointId = chunk.metadata.chunk_id; - if (!Utils.isValidUuid(pointId)) { - pointId = Utils.hashToUuid(chunk.metadata.chunk_id); - } - } catch (e) { - pointId = crypto.randomUUID(); - } - - const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { - ids: [pointId], - with_payload: true, - with_vector: false, - }); - - if (existingPoints.length > 0 && existingPoints[0].payload && existingPoints[0].payload.hash === chunkHash) { - needsEmbedding = false; - chunkProgress.update(1, `Skipping unchanged chunk ${chunkId}`); - logger.info(`Skipping unchanged chunk: ${chunkId}`); - } - } catch (error) { - logger.error(`Error checking existing point in Qdrant:`, error); - } - } + // Track valid chunk IDs 
+ chunks.forEach(chunk => validChunkIds.add(chunk.metadata.chunk_id)); - - if (needsEmbedding) { - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length > 0) { - const embedding = embeddings[0]; - if (dbConnection.type === 'sqlite') { - DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embedding, logger, chunkHash); - chunkProgress.update(1, `Stored chunk ${chunkId} in SQLite`); - } else if (dbConnection.type === 'qdrant') { - await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embedding, chunkHash); - chunkProgress.update(1, `Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`); - } - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - chunkProgress.update(1, `Failed to embed chunk ${chunkId}`); - } - } - } - - chunkProgress.complete(); + await this.processAndStoreChunks(url, chunks, dbConnection, logger); } } catch (error) { @@ -464,73 +351,10 @@ class Doc2Vec { logger.info(`Created ${chunks.length} chunks`); if (chunks.length > 0) { - const chunkProgress = logger.progress(`Embedding chunks for ${filePath}`, chunks.length); - - for (let i = 0; i < chunks.length; i++) { - const chunk = chunks[i]; - validChunkIds.add(chunk.metadata.chunk_id); - - const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; - - let needsEmbedding = true; - const chunkHash = Utils.generateHash(chunk.content); - - if (dbConnection.type === 'sqlite') { - const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); - const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; - - if (existing && existing.hash === chunkHash) { - needsEmbedding = false; - chunkProgress.update(1, `Skipping unchanged chunk ${chunkId}`); - logger.info(`Skipping unchanged chunk: ${chunkId}`); - } - } else if (dbConnection.type === 'qdrant') { - try { - let pointId: string; - try { - pointId = chunk.metadata.chunk_id; - if (!Utils.isValidUuid(pointId)) { - pointId 
= Utils.hashToUuid(chunk.metadata.chunk_id); - } - } catch (e) { - pointId = crypto.randomUUID(); - } - - const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { - ids: [pointId], - with_payload: true, - with_vector: false, - }); - - if (existingPoints.length > 0 && existingPoints[0].payload && existingPoints[0].payload.hash === chunkHash) { - needsEmbedding = false; - chunkProgress.update(1, `Skipping unchanged chunk ${chunkId}`); - logger.info(`Skipping unchanged chunk: ${chunkId}`); - } - } catch (error) { - logger.error(`Error checking existing point in Qdrant:`, error); - } - } - - if (needsEmbedding) { - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length > 0) { - const embedding = embeddings[0]; - if (dbConnection.type === 'sqlite') { - DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embedding, logger, chunkHash); - chunkProgress.update(1, `Stored chunk ${chunkId} in SQLite`); - } else if (dbConnection.type === 'qdrant') { - await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embedding, chunkHash); - chunkProgress.update(1, `Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`); - } - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - chunkProgress.update(1, `Failed to embed chunk ${chunkId}`); - } - } - } - - chunkProgress.complete(); + // Track valid chunk IDs + chunks.forEach(chunk => validChunkIds.add(chunk.metadata.chunk_id)); + + await this.processAndStoreChunks(filePath, chunks, dbConnection, logger); } } catch (error) { logger.error(`Error during chunking or embedding for ${filePath}:`, error); @@ -674,61 +498,7 @@ class Doc2Vec { logger.info(`Ticket #${ticketId}: Created ${chunks.length} chunks`); // Process and store each chunk - for (const chunk of chunks) { - const chunkHash = Utils.generateHash(chunk.content); - const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; - - if (dbConnection.type === 'sqlite') { - 
const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); - const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; - - if (existing && existing.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embeddings[0], logger, chunkHash); - logger.debug(`Stored chunk ${chunkId} in SQLite`); - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - } - } else if (dbConnection.type === 'qdrant') { - try { - let pointId: string; - try { - pointId = chunk.metadata.chunk_id; - if (!Utils.isValidUuid(pointId)) { - pointId = Utils.hashToUuid(chunk.metadata.chunk_id); - } - } catch (e) { - pointId = crypto.randomUUID(); - } - - const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { - ids: [pointId], - with_payload: true, - with_vector: false, - }); - - if (existingPoints.length > 0 && existingPoints[0].payload && existingPoints[0].payload.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embeddings[0], chunkHash); - logger.debug(`Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`); - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - } - } catch (error) { - logger.error(`Error processing chunk in Qdrant:`, error); - } - } - } + await this.processAndStoreChunks(ticketId, chunks, dbConnection, logger); }; logger.info(`Fetching Zendesk tickets updated since ${lastRunDate}`); @@ -851,61 +621,7 @@ class Doc2Vec { logger.info(`Article #${articleId}: Created ${chunks.length} chunks`); // Process and store each chunk (similar to ticket processing) - 
for (const chunk of chunks) { - const chunkHash = Utils.generateHash(chunk.content); - const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; - - if (dbConnection.type === 'sqlite') { - const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); - const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; - - if (existing && existing.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embeddings[0], logger, chunkHash); - logger.debug(`Stored chunk ${chunkId} in SQLite`); - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - } - } else if (dbConnection.type === 'qdrant') { - try { - let pointId: string; - try { - pointId = chunk.metadata.chunk_id; - if (!Utils.isValidUuid(pointId)) { - pointId = Utils.hashToUuid(chunk.metadata.chunk_id); - } - } catch (e) { - pointId = crypto.randomUUID(); - } - - const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { - ids: [pointId], - with_payload: true, - with_vector: false, - }); - - if (existingPoints.length > 0 && existingPoints[0].payload && existingPoints[0].payload.hash === chunkHash) { - logger.info(`Skipping unchanged chunk: ${chunkId}`); - continue; - } - - const embeddings = await this.createEmbeddings([chunk.content]); - if (embeddings.length) { - await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embeddings[0], chunkHash); - logger.debug(`Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`); - } else { - logger.error(`Embedding failed for chunk: ${chunkId}`); - } - } catch (error) { - logger.error(`Error processing chunk in Qdrant:`, error); - } - } - } + await this.processAndStoreChunks(articleId, chunks, dbConnection, logger); }; logger.info(`Fetching Zendesk help center 
articles updated since ${startDate}`); @@ -945,6 +661,112 @@ class Doc2Vec { logger.info(`Successfully processed ${processedArticles} of ${totalArticles} articles (filtered by date >= ${startDate})`); } + private async processNotionDatabase(config: NotionSourceConfig, parentLogger: Logger): Promise<void> { + const logger = parentLogger.child('process'); + logger.info(`Starting processing for Notion: ${config.database_id}`); + + const dbConnection = await DatabaseManager.initDatabase(config, logger); + + // Initialize metadata storage + await DatabaseManager.initDatabaseMetadata(dbConnection, logger); + + const notion = new NotionClient({ + auth: config.api_token, + }); + + // Create a NotionConverter instance + const n2m = new NotionConverter(notion); + + // Chunk the markdown content + const pageConfig = { + ...config, + product_name: config.product_name || 'notion', + max_size: config.max_size || Infinity + }; + + const getMarkdownForPage = async (page_id: string): Promise<[string, string]> => { + let pageUrl = ''; + try { + logger.debug(`Retrieving Notion page: ${page_id}`); + + // Retrieve the page and convert to markdown + const page = await notion.pages.retrieve({ page_id: page_id }); + if (!isFullPage(page)) { + logger.info(`Skipping partial page #${page_id}`); + return ['', '']; + } + + const nameProperty = page.properties.Name; + const pageTitle = nameProperty && nameProperty.type === 'title' && nameProperty.title?.[0]?.plain_text || 'Untitled'; + + pageUrl = page.url; + + logger.debug(`Generating markdown for page: ${pageUrl}`); + + const md = await n2m.convert(page_id); + const mdWithNoImages = md.content.replace(/!\[.*?\]\(.*?\)[\s\n]*/g, ''); + + return [page.url, `# ${pageTitle}${mdWithNoImages}`]; + } catch (error) { + logger.error(`Failed to generate markdown for Notion page ${page_id} (${pageUrl}):`, error); + return ['', '']; + } + } + + const processPage = async (page_id: string): Promise<void> => { + logger.info(`Processing Notion page: ${page_id}`); + 
const [url, md] = await getMarkdownForPage(page_id); + if (!md) { + logger.info(`No markdown for Notion page: ${page_id}`); + return + } + + const chunks = await this.contentProcessor.chunkMarkdown(md, pageConfig, url); + + // Process and store each chunk immediately + await this.processAndStoreChunks(page_id, chunks, dbConnection, logger); + + logger.debug(`Finished processing Notion page: ${page_id}`); + } + + const processDatabase = async (database_id: string): Promise<void> => { + logger.info(`Processing Notion database: ${database_id}`); + + let next_cursor = undefined; + + try { + do { + const response = await notion.databases.query({ + database_id: database_id, + start_cursor: next_cursor, + filter: config.filter, + }); + + for (const res of response.results) { + if (res.object === "database") { + await processDatabase(res.id); + } else if (res.object === "page") { + await processPage(res.id); + } else { + logger.error(`unknown database object: ${res.object}`); + } + } + + next_cursor = response.next_cursor; + } while (next_cursor) + + logger.debug(`Finished processing Notion database: ${database_id}`); + } catch (error) { + logger.error(`Failed to process Notion database ${database_id}:`, error); + } + } + + await processDatabase(config.database_id); + + logger.info(`Completed processing Notion database: ${config.database_id}`); + } + private async createEmbeddings(texts: string[]): Promise<number[][]> { const logger = this.logger.child('embeddings'); try { @@ -960,6 +782,88 @@ class Doc2Vec { return []; } } + + private async processAndStoreChunks( + itemId: string, + chunks: DocumentChunk[], + dbConnection: DatabaseConnection, + logger: Logger, + ): Promise<void> { + const chunkProgress = logger.progress(`Processing chunks for ${itemId}`, chunks.length); + + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const chunkId = chunk.metadata.chunk_id.substring(0, 8) + '...'; + const chunkHash = Utils.generateHash(chunk.content); + + try { + const existing = await 
this.checkExistingChunk(chunk, chunkHash, dbConnection); + + if (existing) { + chunkProgress.update(1, `Skipping unchanged chunk: ${chunkId}`); + continue; + } + } catch (error) { + const message = `Error checking existing point in database for chunk ${chunkId}: ${error}` + logger.error(message); + chunkProgress.update(1, message); + continue; + } + + const embeddings = await this.createEmbeddings([chunk.content]); + if (embeddings.length == 0) { + chunkProgress.update(1, `Embedding failed for chunk: ${chunkId}`); + continue; + } + + const embedding = embeddings[0]; + let message = ''; + if (dbConnection.type === 'sqlite') { + DatabaseManager.insertVectorsSQLite(dbConnection.db, chunk, embedding, logger, chunkHash); + message = `Stored chunk ${chunkId} in SQLite`; + } else if (dbConnection.type === 'qdrant') { + await DatabaseManager.storeChunkInQdrant(dbConnection, chunk, embedding, chunkHash); + message = `Stored chunk ${chunkId} in Qdrant (${dbConnection.collectionName})`; + } + + chunkProgress.update(1, message); + } + + chunkProgress.complete(); + } + + private async checkExistingChunk(chunk: DocumentChunk, chunkHash: string, dbConnection: DatabaseConnection): Promise<boolean> { + if (dbConnection.type === 'sqlite') { + const { checkHashStmt } = DatabaseManager.prepareSQLiteStatements(dbConnection.db); + const existing = checkHashStmt.get(chunk.metadata.chunk_id) as { hash: string } | undefined; + + if (existing && existing.hash === chunkHash) { + return true; + } + } else if (dbConnection.type === 'qdrant') { + let pointId: string; + try { + pointId = chunk.metadata.chunk_id; + if (!Utils.isValidUuid(pointId)) { + pointId = Utils.hashToUuid(chunk.metadata.chunk_id); + } + } catch (e) { + pointId = crypto.randomUUID(); + } + + const existingPoints = await dbConnection.client.retrieve(dbConnection.collectionName, { + ids: [pointId], + with_payload: true, + with_vector: false, + }); + + if (existingPoints.length > 0 && existingPoints[0].payload && 
existingPoints[0].payload.hash === chunkHash) { + return true; + } + } + + return false; + } } if (require.main === module) { diff --git a/logger.ts b/logger.ts index 1226989..3969e5d 100644 --- a/logger.ts +++ b/logger.ts @@ -7,281 +7,324 @@ * Logger levels with their corresponding numeric values */ enum LogLevel { - DEBUG = 0, - INFO = 1, - WARN = 2, - ERROR = 3, - NONE = 100 - } + DEBUG = 0, + INFO = 1, + WARN = 2, + ERROR = 3, + NONE = 100 +} + +/** + * Configuration options for the Logger + */ +interface LoggerConfig { + useTimestamp: boolean; + useColor: boolean; + logToFile?: string; + prettyPrint?: boolean; +} - /** - * Configuration options for the Logger - */ - interface LoggerConfig { - level: LogLevel; - useTimestamp: boolean; - useColor: boolean; - logToFile?: string; - prettyPrint?: boolean; +/** + * Basic color functions that don't rely on external packages + */ +const colors = { + gray: (text: string) => `\x1b[90m${text}\x1b[0m`, + blue: (text: string) => `\x1b[34m${text}\x1b[0m`, + yellow: (text: string) => `\x1b[33m${text}\x1b[0m`, + red: (text: string) => `\x1b[31m${text}\x1b[0m`, + green: (text: string) => `\x1b[32m${text}\x1b[0m`, + reset: (text: string) => `\x1b[0m${text}\x1b[0m` +}; + +/** + * Parse log level from string + */ +const parseLogLevel = (level?: string): LogLevel | undefined => { + if (!level) return undefined; + const upperLevel = level.toUpperCase(); + switch (upperLevel) { + case 'DEBUG': return LogLevel.DEBUG; + case 'INFO': return LogLevel.INFO; + case 'WARN': return LogLevel.WARN; + case 'ERROR': return LogLevel.ERROR; + case 'NONE': return LogLevel.NONE; + default: return undefined; } +} + +/** + * Overwrite standard console Logger to only log messages at the configured log level + */ +declare global { + var logLevel: LogLevel; +} +global.logLevel = parseLogLevel(process.env.LOG_LEVEL) ?? 
LogLevel.INFO; + +const _console = console +global.console = { + ...global.console, + log: (message?: any, ...optionalParams: any[]) => { + shouldLog(LogLevel.INFO) && _console.log(message, ...optionalParams); + }, + warn: (message?: any, ...optionalParams: any[]) => { + shouldLog(LogLevel.WARN) && _console.warn(message, ...optionalParams); + }, + error: (message?: any, ...optionalParams: any[]) => { + shouldLog(LogLevel.ERROR) && _console.error(message, ...optionalParams); + }, + debug: (message?: any, ...optionalParams: any[]) => { + shouldLog(LogLevel.DEBUG) && _console.debug(message, ...optionalParams); + }, +}; + +const shouldLog = (level: LogLevel) => { + return global.logLevel <= level +}; +/** + * Enhanced Logger class with color-coding, timestamp, and formatting + */ +class Logger { + private config: LoggerConfig; + private moduleName: string; + /** - * Basic color functions that don't rely on external packages + * Create a new Logger instance + * + * @param moduleName Name of the module using this logger + * @param config Logger configuration options */ - const colors = { - gray: (text: string) => `\x1b[90m${text}\x1b[0m`, - blue: (text: string) => `\x1b[34m${text}\x1b[0m`, - yellow: (text: string) => `\x1b[33m${text}\x1b[0m`, - red: (text: string) => `\x1b[31m${text}\x1b[0m`, - green: (text: string) => `\x1b[32m${text}\x1b[0m`, - reset: (text: string) => `\x1b[0m${text}\x1b[0m` - }; - + constructor(moduleName: string, config?: Partial) { + this.moduleName = moduleName; + this.config = { + useTimestamp: true, + useColor: true, + prettyPrint: true, + ...config + }; + } + /** - * Enhanced Logger class with color-coding, timestamp, and formatting + * Format a log message with timestamp, level, and module information + * + * @param level Log level for this message + * @param message The message to log + * @param args Additional arguments to include + * @returns Formatted log message */ - class Logger { - private config: LoggerConfig; - private moduleName: 
string; - - /** - * Create a new Logger instance - * - * @param moduleName Name of the module using this logger - * @param config Logger configuration options - */ - constructor(moduleName: string, config?: Partial) { - this.moduleName = moduleName; - this.config = { - level: LogLevel.INFO, - useTimestamp: true, - useColor: true, - prettyPrint: true, - ...config - }; - } - - /** - * Format a log message with timestamp, level, and module information - * - * @param level Log level for this message - * @param message The message to log - * @param args Additional arguments to include - * @returns Formatted log message - */ - private formatMessage(level: string, message: string, args: any[] = []): string { - const timestamp = this.config.useTimestamp ? - `[${new Date().toISOString()}] ` : ''; - - const modulePrefix = this.moduleName ? - `[${this.moduleName}] ` : ''; - - const levelFormatted = `[${level.padEnd(5)}]`; - - let formattedMessage = `${timestamp}${levelFormatted} ${modulePrefix}${message}`; - - if (args.length > 0) { - const argsString = args.map(arg => { - if (arg instanceof Error) { - return `\n--- Error Details ---\nMessage: ${arg.message}\nStack:\n${arg.stack}\n--- End Error ---`; - } - else if (this.config.prettyPrint && typeof arg === 'object' && arg !== null) { - try { - return JSON.stringify(arg, null, 2); - } catch (e) { - return "[Unserializable Object]"; - } - } else { - return String(arg); + private formatMessage(level: string, message: string, args: any[] = []): string { + const timestamp = this.config.useTimestamp ? + `[${new Date().toISOString()}] ` : ''; + + const modulePrefix = this.moduleName ? 
+ `[${this.moduleName}] ` : ''; + + const levelFormatted = `[${level.padEnd(5)}]`; + + let formattedMessage = `${timestamp}${levelFormatted} ${modulePrefix}${message}`; + + if (args.length > 0) { + const argsString = args.map(arg => { + if (arg instanceof Error) { + return `\n--- Error Details ---\nMessage: ${arg.message}\nStack:\n${arg.stack}\n--- End Error ---`; + } + else if (this.config.prettyPrint && typeof arg === 'object' && arg !== null) { + try { + return JSON.stringify(arg, null, 2); + } catch (e) { + return "[Unserializable Object]"; } - }).join('\n'); - - if (this.config.prettyPrint) { - formattedMessage += `\n${argsString}`; } else { - formattedMessage += ` ${args.map(String).join(' ')}`; + return String(arg); } - } - - return formattedMessage; - } - - /** - * Apply color to a message based on log level - * - * @param level Log level - * @param message Message to color - * @returns Colored message - */ - private colorize(level: LogLevel, message: string): string { - if (!this.config.useColor) return message; - - switch (level) { - case LogLevel.DEBUG: - return colors.gray(message); - case LogLevel.INFO: - return colors.blue(message); - case LogLevel.WARN: - return colors.yellow(message); - case LogLevel.ERROR: - return colors.red(message); - default: - return message; + }).join('\n'); + + if (this.config.prettyPrint) { + formattedMessage += `\n${argsString}`; + } else { + formattedMessage += ` ${args.map(String).join(' ')}`; } } - - /** - * Log a debug message - * - * @param message Message to log - * @param args Additional arguments - */ - debug(message: string, ...args: any[]): void { - if (this.config.level <= LogLevel.DEBUG) { - const formattedMessage = this.formatMessage('DEBUG', message, args); - console.log(this.colorize(LogLevel.DEBUG, formattedMessage)); - } + + return formattedMessage; + } + + /** + * Apply color to a message based on log level + * + * @param level Log level + * @param message Message to color + * @returns Colored message + 
*/ + private colorize(level: LogLevel, message: string): string { + if (!this.config.useColor) return message; + + switch (level) { + case LogLevel.DEBUG: + return colors.gray(message); + case LogLevel.INFO: + return colors.blue(message); + case LogLevel.WARN: + return colors.yellow(message); + case LogLevel.ERROR: + return colors.red(message); + default: + return message; } - - /** - * Log an info message - * - * @param message Message to log - * @param args Additional arguments - */ - info(message: string, ...args: any[]): void { - if (this.config.level <= LogLevel.INFO) { - const formattedMessage = this.formatMessage('INFO', message, args); - console.log(this.colorize(LogLevel.INFO, formattedMessage)); - } + } + + /** + * Log a debug message + * + * @param message Message to log + * @param args Additional arguments + */ + debug(message: string, ...args: any[]): void { + if (shouldLog(LogLevel.DEBUG)) { + const formattedMessage = this.formatMessage('DEBUG', message, args); + _console.log(this.colorize(LogLevel.DEBUG, formattedMessage)); } - - /** - * Log a warning message - * - * @param message Message to log - * @param args Additional arguments - */ - warn(message: string, ...args: any[]): void { - if (this.config.level <= LogLevel.WARN) { - const formattedMessage = this.formatMessage('WARN', message, args); - console.warn(this.colorize(LogLevel.WARN, formattedMessage)); - } + } + + /** + * Log an info message + * + * @param message Message to log + * @param args Additional arguments + */ + info(message: string, ...args: any[]): void { + if (shouldLog(LogLevel.INFO)) { + const formattedMessage = this.formatMessage('INFO', message, args); + _console.log(this.colorize(LogLevel.INFO, formattedMessage)); } - - /** - * Log an error message - * - * @param message Message to log - * @param args Additional arguments - */ - error(message: string, ...args: any[]): void { - if (this.config.level <= LogLevel.ERROR) { - const formattedMessage = this.formatMessage('ERROR', 
message, args); - console.error(this.colorize(LogLevel.ERROR, formattedMessage)); - } + } + + /** + * Log a warning message + * + * @param message Message to log + * @param args Additional arguments + */ + warn(message: string, ...args: any[]): void { + if (shouldLog(LogLevel.WARN)) { + const formattedMessage = this.formatMessage('WARN', message, args); + _console.warn(this.colorize(LogLevel.WARN, formattedMessage)); } - - /** - * Create a child logger with a more specific module name - * - * @param subModule Name of the sub-module - * @returns New logger instance - */ - child(subModule: string): Logger { - return new Logger(`${this.moduleName}:${subModule}`, this.config); + } + + /** + * Log an error message + * + * @param message Message to log + * @param args Additional arguments + */ + error(message: string, ...args: any[]): void { + if (shouldLog(LogLevel.ERROR)) { + const formattedMessage = this.formatMessage('ERROR', message, args); + _console.error(this.colorize(LogLevel.ERROR, formattedMessage)); } - - /** - * Format a section header to clearly separate logical parts of execution - * - * @param title Section title - * @returns Logger instance for chaining - */ - section(title: string): Logger { - if (this.config.level <= LogLevel.INFO) { - const separator = '='.repeat(Math.max(80 - title.length - 4, 10)); - const message = `${separator} ${title} ${separator}`; - console.log(this.colorize(LogLevel.INFO, message)); - } - return this; + } + + /** + * Create a child logger with a more specific module name + * + * @param subModule Name of the sub-module + * @returns New logger instance + */ + child(subModule: string): Logger { + return new Logger(`${this.moduleName}:${subModule}`, this.config); + } + + /** + * Format a section header to clearly separate logical parts of execution + * + * @param title Section title + * @returns Logger instance for chaining + */ + section(title: string): Logger { + if (shouldLog(LogLevel.INFO)) { + const separator = 
'='.repeat(Math.max(80 - title.length - 4, 10)); + const message = `${separator} ${title} ${separator}`; + _console.log(this.colorize(LogLevel.INFO, message)); } - - /** - * Create a progress indicator - * - * @param title Title of the operation - * @param total Total number of items to process - * @returns Object with update and complete methods - */ - progress(title: string, total: number) { - let current = 0; - const startTime = Date.now(); + return this; + } + + /** + * Create a progress indicator + * + * @param title Title of the operation + * @param total Total number of items to process + * @returns Object with update and complete methods + */ + progress(title: string, total: number) { + let current = 0; + const startTime = Date.now(); + + const update = (increment = 1, message?: string) => { + if (!shouldLog(LogLevel.INFO)) return; - const update = (increment = 1, message?: string) => { - if (this.config.level > LogLevel.INFO) return; - - current += increment; - const percentage = Math.min(Math.floor((current / total) * 100), 100); - const elapsed = (Date.now() - startTime) / 1000; - let rate = current / elapsed; - - let timeRemaining = ''; - if (rate > 0 && current < total) { - const remainingSecs = (total - current) / rate; - timeRemaining = `, ETA: ${Math.floor(remainingSecs / 60)}m ${Math.floor(remainingSecs % 60)}s`; - } - - const progressBar = this.createProgressBar(percentage); - - const statusMsg = message ? 
` - ${message}` : ''; - console.log(this.colorize( - LogLevel.INFO, - this.formatMessage('INFO', `${title}: ${progressBar} ${percentage}% (${current}/${total}${timeRemaining})${statusMsg}`) - )); - }; + current += increment; + const percentage = Math.min(Math.floor((current / total) * 100), 100); + const elapsed = (Date.now() - startTime) / 1000; + let rate = current / elapsed; - const complete = (message = 'Completed') => { - if (this.config.level > LogLevel.INFO) return; - - const elapsed = (Date.now() - startTime) / 1000; - const rate = total / elapsed; - - console.log(this.colorize( - LogLevel.INFO, - this.formatMessage('INFO', `${title}: ${this.createProgressBar(100)} 100% (${total}/${total}) - ${message} in ${elapsed.toFixed(2)}s (${rate.toFixed(2)} items/sec)`) - )); - }; + let timeRemaining = ''; + if (rate > 0 && current < total) { + const remainingSecs = (total - current) / rate; + timeRemaining = `, ETA: ${Math.floor(remainingSecs / 60)}m ${Math.floor(remainingSecs % 60)}s`; + } - return { update, complete }; - } - - /** - * Create a visual progress bar - * - * @param percentage Completion percentage - * @returns Visual progress bar - */ - private createProgressBar(percentage: number): string { - const width = 20; - const completeChars = Math.floor((percentage / 100) * width); - const incompleteChars = width - completeChars; + const progressBar = this.createProgressBar(percentage); - let bar = '['; - if (this.config.useColor) { - bar += colors.green('='.repeat(completeChars)); - bar += ' '.repeat(incompleteChars); - } else { - bar += '='.repeat(completeChars); - bar += ' '.repeat(incompleteChars); - } - bar += ']'; + const statusMsg = message ? 
` - ${message}` : ''; + _console.log(this.colorize( + LogLevel.INFO, + this.formatMessage('INFO', `${title}: ${progressBar} ${percentage}% (${current}/${total}${timeRemaining})${statusMsg}`) + )); + }; + + const complete = (message = 'Completed') => { + if (!shouldLog(LogLevel.INFO)) return; + + const elapsed = (Date.now() - startTime) / 1000; + const rate = total / elapsed; - return bar; + _console.log(this.colorize( + LogLevel.INFO, + this.formatMessage('INFO', `${title}: ${this.createProgressBar(100)} 100% (${total}/${total}) - ${message} in ${elapsed.toFixed(2)}s (${rate.toFixed(2)} items/sec)`) + )); + }; + + return { update, complete }; + } + + /** + * Create a visual progress bar + * + * @param percentage Completion percentage + * @returns Visual progress bar + */ + private createProgressBar(percentage: number): string { + const width = 20; + const completeChars = Math.floor((percentage / 100) * width); + const incompleteChars = width - completeChars; + + let bar = '['; + if (this.config.useColor) { + bar += colors.green('='.repeat(completeChars)); + bar += ' '.repeat(incompleteChars); + } else { + bar += '='.repeat(completeChars); + bar += ' '.repeat(incompleteChars); } + bar += ']'; + + return bar; } +} - // Create a default logger instance - const defaultLogger = new Logger('app'); - - export { Logger, LogLevel, defaultLogger }; \ No newline at end of file +// Create a default logger instance +const defaultLogger = new Logger('app'); + +export { Logger, LogLevel, defaultLogger }; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 4afa738..4d3f3d1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,15 +1,16 @@ { "name": "doc2vec", - "version": "1.1.1", + "version": "1.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "doc2vec", - "version": "1.1.1", + "version": "1.3.0", "license": "ISC", "dependencies": { "@mozilla/readability": "^0.4.4", + "@notionhq/client": "^2.3.0", 
"@qdrant/js-client-rest": "^1.13.0", "@qdrant/qdrant-js": "^1.13.0", "axios": "^1.6.2", @@ -19,6 +20,7 @@ "dotenv": "^16.3.1", "js-yaml": "^4.1.0", "jsdom": "^26.0.0", + "notion-to-md": "^4.0.0-alpha.7", "openai": "^4.20.1", "pdfjs-dist": "^5.3.31", "puppeteer": "^24.1.1", @@ -112,7 +114,6 @@ "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", - "dev": true, "dependencies": { "@jridgewell/trace-mapping": "0.3.9" }, @@ -237,7 +238,6 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", - "dev": true, "engines": { "node": ">=6.0.0" } @@ -245,14 +245,12 @@ "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz", - "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==", - "dev": true + "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==" }, "node_modules/@jridgewell/trace-mapping": { "version": "0.3.9", "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", - "dev": true, "dependencies": { "@jridgewell/resolve-uri": "^3.0.3", "@jridgewell/sourcemap-codec": "^1.4.10" @@ -453,6 +451,19 @@ "node": ">= 10" } }, + "node_modules/@notionhq/client": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/@notionhq/client/-/client-2.3.0.tgz", + "integrity": "sha512-l7WqTCpQqC+HibkB9chghONQTYcxNQT0/rOJemBfmuKQRTu2vuV8B3yA395iKaUdDo7HI+0KvQaz9687Xskzkw==", + 
"license": "MIT", + "dependencies": { + "@types/node-fetch": "^2.5.10", + "node-fetch": "^2.6.1" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/@puppeteer/browsers": { "version": "2.8.0", "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.8.0.tgz", @@ -568,26 +579,22 @@ "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", - "integrity": "sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==", - "dev": true + "integrity": "sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==" }, "node_modules/@tsconfig/node12": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", - "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", - "dev": true + "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==" }, "node_modules/@tsconfig/node14": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", - "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", - "dev": true + "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==" }, "node_modules/@tsconfig/node16": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", - "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==", - "dev": true + "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, "node_modules/@types/better-sqlite3": { "version": "7.6.12", @@ -677,7 +684,6 @@ "version": "8.14.1", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.1.tgz", "integrity": 
"sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==", - "dev": true, "bin": { "acorn": "bin/acorn" }, @@ -689,7 +695,6 @@ "version": "8.3.4", "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.4.tgz", "integrity": "sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==", - "dev": true, "dependencies": { "acorn": "^8.11.0" }, @@ -741,8 +746,7 @@ "node_modules/arg": { "version": "4.1.3", "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", - "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", - "dev": true + "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==" }, "node_modules/argparse": { "version": "2.0.1", @@ -1121,8 +1125,7 @@ "node_modules/create-require": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", - "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", - "dev": true + "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==" }, "node_modules/css-select": { "version": "5.1.0", @@ -1302,7 +1305,6 @@ "version": "4.0.2", "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", - "dev": true, "engines": { "node": ">=0.3.1" } @@ -2051,8 +2053,7 @@ "node_modules/make-error": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", - "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", - "dev": true + "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==" }, "node_modules/math-intrinsics": { "version": 
"1.1.0", @@ -2062,6 +2063,18 @@ "node": ">= 0.4" } }, + "node_modules/mime": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-3.0.0.tgz", + "integrity": "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==", + "license": "MIT", + "bin": { + "mime": "cli.js" + }, + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/mime-db": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", @@ -2193,6 +2206,23 @@ } } }, + "node_modules/notion-to-md": { + "version": "4.0.0-alpha.7", + "resolved": "https://registry.npmjs.org/notion-to-md/-/notion-to-md-4.0.0-alpha.7.tgz", + "integrity": "sha512-3kocKMEVcivy2ccuv2uZDJQFKXdvRmsujbN2GeOwP6yoNqhj/c/fmXroqPkk4XXRqNdJB2jzf5NPhPSWpuZkdA==", + "license": "MIT", + "dependencies": { + "mime": "^3.0.0", + "node-fetch": "^2.7.0", + "ts-node": "^10.9.2" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "@notionhq/client": "^2.0.0" + } + }, "node_modules/nth-check": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", @@ -2920,7 +2950,6 @@ "version": "10.9.2", "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz", "integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==", - "dev": true, "dependencies": { "@cspotcode/source-map-support": "^0.8.0", "@tsconfig/node10": "^1.0.7", @@ -3024,8 +3053,7 @@ "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", - "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", - "dev": true + "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==" }, "node_modules/w3c-xmlserializer": { "version": "5.0.0", @@ -3179,7 +3207,6 @@ "version": "3.1.1", "resolved": 
"https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", - "dev": true, "engines": { "node": ">=6" } diff --git a/package.json b/package.json index 58cffbb..5d03c45 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "license": "ISC", "dependencies": { "@mozilla/readability": "^0.4.4", + "@notionhq/client": "^2.3.0", "@qdrant/js-client-rest": "^1.13.0", "@qdrant/qdrant-js": "^1.13.0", "axios": "^1.6.2", @@ -36,6 +37,7 @@ "dotenv": "^16.3.1", "js-yaml": "^4.1.0", "jsdom": "^26.0.0", + "notion-to-md": "^4.0.0-alpha.7", "openai": "^4.20.1", "pdfjs-dist": "^5.3.31", "puppeteer": "^24.1.1", diff --git a/types.ts b/types.ts index db61f39..bf20caf 100644 --- a/types.ts +++ b/types.ts @@ -1,6 +1,6 @@ // Base configuration that applies to all source types export interface BaseSourceConfig { - type: 'website' | 'github' | 'local_directory' | 'zendesk'; + type: 'website' | 'github' | 'local_directory' | 'zendesk' | 'notion'; product_name: string; version: string; max_size: number; @@ -45,8 +45,15 @@ export interface ZendeskSourceConfig extends BaseSourceConfig { ticket_priority?: string[]; // Filter tickets by priority (default: all) } +export interface NotionSourceConfig extends BaseSourceConfig { + type: 'notion'; + api_token: string; // Notion API token + database_id: string; // Notion Database ID + filter?: any; // Filter for the database query - see https://developers.notion.com/reference/post-database-query for more details +} + // Union type for all possible source configurations -export type SourceConfig = WebsiteSourceConfig | GithubSourceConfig | LocalDirectorySourceConfig | ZendeskSourceConfig; +export type SourceConfig = WebsiteSourceConfig | GithubSourceConfig | LocalDirectorySourceConfig | ZendeskSourceConfig | NotionSourceConfig; // Database configuration export interface DatabaseConfig {