@@ -12,13 +12,17 @@ import { Logger, LogLevel } from './logger';
1212import { Utils } from './utils' ;
1313import { DatabaseManager } from './database' ;
1414import { ContentProcessor } from './content-processor' ;
15+
16+ import { isFullPage , Client as NotionClient } from '@notionhq/client' ;
17+ import { NotionConverter } from 'notion-to-md' ;
18+
1519import {
1620 Config ,
17- SourceConfig ,
1821 GithubSourceConfig ,
1922 WebsiteSourceConfig ,
2023 LocalDirectorySourceConfig ,
2124 ZendeskSourceConfig ,
25+ NotionSourceConfig ,
2226 DatabaseConnection ,
2327 DocumentChunk
2428} from './types' ;
@@ -35,7 +39,6 @@ class Doc2Vec {
3539
3640 constructor ( configPath : string ) {
3741 this . logger = new Logger ( 'Doc2Vec' , {
38- level : LogLevel . DEBUG ,
3942 useTimestamp : true ,
4043 useColor : true ,
4144 prettyPrint : true
@@ -92,6 +95,8 @@ class Doc2Vec {
9295 await this . processLocalDirectory ( sourceConfig , sourceLogger ) ;
9396 } else if ( sourceConfig . type === 'zendesk' ) {
9497 await this . processZendesk ( sourceConfig , sourceLogger ) ;
98+ } else if ( sourceConfig . type === 'notion' ) {
99+ await this . processNotionDatabase ( sourceConfig , sourceLogger ) ;
95100 } else {
96101 sourceLogger . error ( `Unknown source type: ${ ( sourceConfig as any ) . type } ` ) ;
97102 }
@@ -656,6 +661,112 @@ class Doc2Vec {
656661 logger . info ( `Successfully processed ${ processedArticles } of ${ totalArticles } articles (filtered by date >= ${ startDate } )` ) ;
657662 }
658663
/**
 * Processes a Notion database source: walks every entry returned by the
 * database query (following pagination cursors), converts each page to
 * markdown, chunks it and stores the chunks.
 *
 * Failures are handled on a best-effort basis: a page that cannot be
 * retrieved or converted is logged and skipped, and a failing database query
 * is logged without throwing, so one bad entry does not abort the whole run.
 *
 * @param config - Notion source configuration. This method reads
 *   `database_id`, `api_token`, `filter`, `product_name` and `max_size`.
 * @param parentLogger - Logger from which a `process` child logger is derived.
 * @returns A promise that resolves once all reachable pages were attempted.
 */
private async processNotionDatabase(config: NotionSourceConfig, parentLogger: Logger): Promise<void> {
  const logger = parentLogger.child('process');
  logger.info(`Starting processing for Notion: ${config.database_id}`);

  const dbConnection = await DatabaseManager.initDatabase(config, logger);

  // Initialize metadata storage
  await DatabaseManager.initDatabaseMetadata(dbConnection, logger);

  const notion = new NotionClient({
    auth: config.api_token,
  });

  // Converter that renders a Notion page as markdown
  const n2m = new NotionConverter(notion);

  // Configuration handed to the markdown chunker. `||` (not `??`) is kept on
  // purpose so that empty / zero values also fall back to the defaults.
  const pageConfig = {
    ...config,
    product_name: config.product_name || 'notion',
    max_size: config.max_size || Infinity
  };

  /**
   * Retrieves a page and renders it as markdown.
   * Returns `[url, markdown]`, or `['', '']` when the page is partial or
   * could not be converted.
   */
  const getMarkdownForPage = async (page_id: string): Promise<[string, string]> => {
    let pageUrl = '';
    try {
      logger.debug(`Retrieving Notion page: ${page_id}`);

      // Retrieve the page and convert to markdown
      const page = await notion.pages.retrieve({ page_id: page_id });
      if (!isFullPage(page)) {
        logger.info(`Skipping partial page #${page_id}`);
        return ['', ''];
      }

      // The title column can carry any name, so look it up by property type
      // instead of assuming it is called "Name". A title is a list of
      // rich-text segments; join them all so formatted titles are not cut off.
      const titleProperty = Object.values(page.properties).find((property) => property.type === 'title');
      const titleText = titleProperty && titleProperty.type === 'title'
        ? (titleProperty.title ?? []).map((segment) => segment.plain_text).join('')
        : '';
      const pageTitle = titleText || 'Untitled';

      pageUrl = page.url;

      logger.debug(`Generating markdown for page: ${pageUrl}`);

      const md = await n2m.convert(page_id);
      // Strip markdown image tags together with any whitespace that follows them
      const mdWithNoImages = md.content.replace(/!\[.*?\]\(.*?\)[\s\n]*/g, '');

      // A blank line keeps the page body out of the heading line
      return [pageUrl, `# ${pageTitle}\n\n${mdWithNoImages}`];
    } catch (error) {
      logger.error(`Failed to generate markdown for Notion page ${page_id} (${pageUrl}):`, error);
      return ['', ''];
    }
  };

  /** Converts, chunks and stores a single page; pages without markdown are skipped. */
  const processPage = async (page_id: string): Promise<void> => {
    logger.info(`Processing Notion page: ${page_id}`);

    const [url, md] = await getMarkdownForPage(page_id);
    if (!md) {
      logger.info(`No markdown for Notion page: ${page_id}`);
      return;
    }

    const chunks = await this.contentProcessor.chunkMarkdown(md, pageConfig, url);

    // Process and store each chunk immediately
    await this.processAndStoreChunks(page_id, chunks, dbConnection, logger);

    logger.debug(`Finished processing Notion page: ${page_id}`);
  };

  /** Queries a database page by page, recursing into nested databases. */
  const processDatabase = async (database_id: string): Promise<void> => {
    logger.info(`Processing Notion database: ${database_id}`);

    // Explicitly typed: the API reports "no more results" as `null`, while
    // `start_cursor` is passed back as a string or left undefined.
    let next_cursor: string | undefined;

    try {
      do {
        const response = await notion.databases.query({
          database_id: database_id,
          start_cursor: next_cursor,
          filter: config.filter,
        });

        for (const res of response.results) {
          if (res.object === 'database') {
            await processDatabase(res.id);
          } else if (res.object === 'page') {
            await processPage(res.id);
          } else {
            // Template literal so the offending object type is actually
            // interpolated. The cast keeps this compiling when the checker
            // has already narrowed `res` to `never`.
            const objectType = String((res as { object?: unknown }).object);
            logger.error(`unknown database object: ${objectType}`);
          }
        }

        next_cursor = response.next_cursor ?? undefined;
      } while (next_cursor);

      logger.debug(`Finished processing Notion database: ${database_id}`);
    } catch (error) {
      logger.error(`Failed to process Notion database ${database_id}:`, error);
    }
  };

  await processDatabase(config.database_id);

  logger.info(`Completed processing Notion database: ${config.database_id}`);
}
769+
659770 private async createEmbeddings ( texts : string [ ] ) : Promise < number [ ] [ ] > {
660771 const logger = this . logger . child ( 'embeddings' ) ;
661772 try {
0 commit comments