Skip to content

Commit c1e95bc

Browse files
committed
feat: add support for Notion database source
1 parent f8dc3bd commit c1e95bc

File tree

6 files changed

+482
-285
lines changed

6 files changed

+482
-285
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ The primary goal is to prepare documentation content for Retrieval-Augmented Gen
1717
* **Knowledge Base Articles:** Converts help center articles from HTML to clean Markdown.
1818
* **Incremental Updates:** Only processes tickets/articles updated since the last run.
1919
* **Flexible Filtering:** Filter tickets by status and priority.
20+
* **Notion Integration:** Fetches entries from a Notion database, converting them to searchable chunks.
21+
* **Flexible Filtering:** Filter the entries returned from the Notion database using the specified criteria.
2022
* **Local Directory Processing:** Scans local directories for files, converts content to searchable chunks.
2123
* **PDF Support:** Automatically extracts text from PDF files and converts them to Markdown format using Mozilla's PDF.js.
2224
* **Content Extraction:** Uses Puppeteer for rendering JavaScript-heavy pages and `@mozilla/readability` to extract the main article content.
@@ -114,6 +116,12 @@ Configuration is managed through two files:
114116
* `start_date`: (Optional) Only process tickets/articles updated since this date (e.g., `'2025-01-01'`).
115117
* `ticket_status`: (Optional) Filter tickets by status (defaults to `['new', 'open', 'pending', 'hold', 'solved']`).
116118
* `ticket_priority`: (Optional) Filter tickets by priority (defaults to all priorities).
119+
120+
For Notion (`type: 'notion'`):
121+
* `api_token`: Your Notion API token (reference environment variable
122+
as `'${NOTION_API_TOKEN}'`).
123+
* `database_id`: The ID of the Notion database to query.
124+
* `filter`: A Notion filter object applied when querying the database (passed to the Notion API query as-is; see the example below).
117125

118126
Common configuration for all types:
119127
* `product_name`: A string identifying the product (used in metadata).
@@ -186,6 +194,21 @@ Configuration is managed through two files:
186194
type: 'sqlite'
187195
params:
188196
db_path: './zendesk-kb.db'
197+
198+
# Notion example
199+
- type: 'notion'
200+
product_name: 'notion-database'
201+
version: 'latest'
202+
database_id: '897e5a76ae524b489fdfe71f5945d1af'
203+
api_token: '${NOTION_API_TOKEN}'
204+
filter:
205+
property: 'Status'
206+
status:
207+
equals: 'Current'
208+
database_config:
209+
type: 'sqlite'
210+
params:
211+
db_path: './notion-database.db'
189212
190213
# Qdrant example
191214
- type: 'website'

doc2vec.ts

Lines changed: 113 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@ import { Logger, LogLevel } from './logger';
1212
import { Utils } from './utils';
1313
import { DatabaseManager } from './database';
1414
import { ContentProcessor } from './content-processor';
15+
16+
import { isFullPage, Client as NotionClient } from '@notionhq/client';
17+
import { NotionConverter } from 'notion-to-md';
18+
1519
import {
1620
Config,
17-
SourceConfig,
1821
GithubSourceConfig,
1922
WebsiteSourceConfig,
2023
LocalDirectorySourceConfig,
2124
ZendeskSourceConfig,
25+
NotionSourceConfig,
2226
DatabaseConnection,
2327
DocumentChunk
2428
} from './types';
@@ -35,7 +39,6 @@ class Doc2Vec {
3539

3640
constructor(configPath: string) {
3741
this.logger = new Logger('Doc2Vec', {
38-
level: LogLevel.DEBUG,
3942
useTimestamp: true,
4043
useColor: true,
4144
prettyPrint: true
@@ -92,6 +95,8 @@ class Doc2Vec {
9295
await this.processLocalDirectory(sourceConfig, sourceLogger);
9396
} else if (sourceConfig.type === 'zendesk') {
9497
await this.processZendesk(sourceConfig, sourceLogger);
98+
} else if (sourceConfig.type === 'notion') {
99+
await this.processNotionDatabase(sourceConfig, sourceLogger);
95100
} else {
96101
sourceLogger.error(`Unknown source type: ${(sourceConfig as any).type}`);
97102
}
@@ -656,6 +661,112 @@ class Doc2Vec {
656661
logger.info(`Successfully processed ${processedArticles} of ${totalArticles} articles (filtered by date >= ${startDate})`);
657662
}
658663

664+
/**
 * Ingests a Notion database: queries it page by page (following the API
 * cursor), converts every full page to Markdown, chunks the Markdown and
 * stores the chunks through `processAndStoreChunks`.
 *
 * Failures are logged and swallowed at two levels so one bad page or one
 * failed query does not abort the rest of the run:
 *  - a page that cannot be retrieved/converted is skipped;
 *  - a database query that throws stops pagination for that database only.
 *
 * @param config Notion source configuration (`api_token`, `database_id`,
 *               `filter`, plus the common source options).
 * @param parentLogger Logger to derive this run's child logger from.
 */
private async processNotionDatabase(config: NotionSourceConfig, parentLogger: Logger): Promise<void> {
    const logger = parentLogger.child('process');
    logger.info(`Starting processing for Notion: ${config.database_id}`);

    const dbConnection = await DatabaseManager.initDatabase(config, logger);

    // Initialize metadata storage
    await DatabaseManager.initDatabaseMetadata(dbConnection, logger);

    const notion = new NotionClient({
        auth: config.api_token,
    });

    // Converter that turns a Notion page into Markdown
    const n2m = new NotionConverter(notion);

    // Chunking configuration: source config with defaults filled in.
    // `||` is deliberate: an empty product name or a max_size of 0 falls back to the default.
    const pageConfig = {
        ...config,
        product_name: config.product_name || 'notion',
        max_size: config.max_size || Infinity
    };

    /**
     * Retrieves one page and renders it as Markdown.
     * Returns `[url, markdown]`, or `['', '']` when the page is partial or
     * anything fails (the failure is logged, not thrown).
     */
    const getMarkdownForPage = async (page_id: string): Promise<[string, string]> => {
        let pageUrl = '';
        try {
            logger.debug(`Retrieving Notion page: ${page_id}`);

            // Retrieve the page and convert to markdown
            const page = await notion.pages.retrieve({ page_id: page_id });
            if (!isFullPage(page)) {
                logger.info(`Skipping partial page #${page_id}`);
                return ['', ''];
            }

            pageUrl = page.url;

            // Prefer a title property called "Name" (the previous behavior), but
            // fall back to whichever property has type 'title' so databases that
            // renamed their title column do not end up as "Untitled".
            const nameProperty = page.properties.Name;
            const titleProperty = nameProperty?.type === 'title'
                ? nameProperty
                : Object.values(page.properties).find((property) => property.type === 'title');

            // A title is a list of rich-text segments; join them all instead of
            // keeping only the first one.
            const joinedTitle = titleProperty?.type === 'title'
                ? (titleProperty.title ?? []).map((segment) => segment.plain_text).join('')
                : '';
            const pageTitle = joinedTitle || 'Untitled';

            logger.debug(`Generating markdown for page: ${pageUrl}`);

            const md = await n2m.convert(page_id);
            // Drop Markdown image references together with trailing whitespace.
            const mdWithNoImages = md.content.replace(/!\[.*?\]\(.*?\)[\s\n]*/g, '');

            // Blank line after the heading so it can never merge with the first
            // line of the converted content.
            return [page.url, `# ${pageTitle}\n\n${mdWithNoImages}`];
        } catch (error) {
            logger.error(`Failed to generate markdown for Notion page ${page_id} (${pageUrl}):`, error);
            return ['', ''];
        }
    };

    /** Converts, chunks and stores a single page; pages without Markdown are skipped. */
    const processPage = async (page_id: string): Promise<void> => {
        logger.info(`Processing Notion page: ${page_id}`);

        const [url, md] = await getMarkdownForPage(page_id);
        if (!md) {
            logger.info(`No markdown for Notion page: ${page_id}`);
            return;
        }

        const chunks = await this.contentProcessor.chunkMarkdown(md, pageConfig, url);

        // Process and store each chunk immediately
        await this.processAndStoreChunks(page_id, chunks, dbConnection, logger);

        logger.debug(`Finished processing Notion page: ${page_id}`);
    };

    /**
     * Queries a database with the configured filter, following pagination, and
     * processes every result. Results that are themselves databases are
     * processed recursively (with the same filter).
     */
    const processDatabase = async (database_id: string): Promise<void> => {
        logger.info(`Processing Notion database: ${database_id}`);

        let next_cursor: string | undefined = undefined;

        try {
            do {
                const response = await notion.databases.query({
                    database_id: database_id,
                    start_cursor: next_cursor,
                    filter: config.filter,
                });

                for (const res of response.results) {
                    if (res.object === "database") {
                        await processDatabase(res.id);
                    } else if (res.object === "page") {
                        await processPage(res.id);
                    } else {
                        // Template literal (backticks) so the object kind is actually
                        // interpolated. The cast is needed because the compiler may
                        // already consider this branch unreachable.
                        const unknownKind = (res as { object?: unknown }).object;
                        logger.error(`unknown database object: ${String(unknownKind)}`);
                    }
                }

                // The API reports "no more pages" as null; normalize to undefined
                // so the value is always valid as a start_cursor.
                next_cursor = response.next_cursor ?? undefined;
            } while (next_cursor);

            logger.debug(`Finished processing Notion database: ${database_id}`);
        } catch (error) {
            logger.error(`Failed to process Notion database ${database_id}:`, error);
        }
    };

    await processDatabase(config.database_id);

    logger.info(`Completed processing Notion database: ${config.database_id}`);
}
769+
659770
private async createEmbeddings(texts: string[]): Promise<number[][]> {
660771
const logger = this.logger.child('embeddings');
661772
try {

0 commit comments

Comments
 (0)