@@ -153,54 +153,80 @@ public async Task PreprocessAsync(
153153 ignoreUsfmMarkers ??= [ ] ;
154154
155155 bool parallelTrainingDataPresent = false ;
156- List < Row > keyTermTrainingData = new ( ) ;
156+ List < Row > keyTermTrainingData = [ ] ;
157+
158+ // Iterate over USFM and Text training corpora separately.
159+ // This is not only because they use different keys, but also because, when text corpora
160+ // are combined with scripture corpora, we don't want to exclude the text corpora from training.
161+ foreach ( bool isScripture in new bool [ ] { true , false } )
162+ {
163+ // Create source and target arrays of text corpora filtered for training
164+ // based on the filters specified in the associated monolingual corpora
165+ ITextCorpus [ ] sourceTrainingCorpora =
166+ [
167+ .. corpusBundle . SourceTextCorpora . SelectMany ( c =>
168+ c . TextCorpora . Where ( tc => isScripture ? tc . IsScripture ( ) : ! tc . IsScripture ( ) )
169+ . Select ( tc => FilterTrainingCorpora ( c . MonolingualCorpus , tc ) )
170+ ) ,
171+ ] ;
172+
173+ ITextCorpus [ ] targetTrainingCorpora =
174+ [
175+ .. corpusBundle . TargetTextCorpora . SelectMany ( c =>
176+ c . TextCorpora . Where ( tc => isScripture ? tc . IsScripture ( ) : ! tc . IsScripture ( ) )
177+ . Select ( tc => FilterTrainingCorpora ( c . MonolingualCorpus , tc ) )
178+ ) ,
179+ ] ;
180+
181+ // To support mixed source, collapse multiple source text corpora into one text corpus
182+ // by randomly interlacing content from each of the source text corpora
183+ ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora . ChooseRandom ( Seed ) ;
184+ if ( sourceTrainingCorpus . IsScripture ( ) )
185+ {
186+ // Filter out all non-scripture; we only train on scripture content
187+ sourceTrainingCorpus = sourceTrainingCorpus . Where ( IsScriptureRow ) ;
188+ }
157189
158- // Create source and target arrays of text corpora filtered for training
159- // based on the filters specified in the associated monolingual corpora
160- ITextCorpus [ ] sourceTrainingCorpora = corpusBundle
161- . SourceTextCorpora . SelectMany ( c =>
162- c . TextCorpora . Select ( tc => FilterTrainingCorpora ( c . MonolingualCorpus , tc ) )
163- )
164- . ToArray ( ) ;
190+ // Instead of interlacing rows from the target text corpora randomly, just take the
191+ // text row from the first target text corpus that has content for that row
192+ ITextCorpus targetTrainingCorpus = targetTrainingCorpora . ChooseFirst ( ) ;
193+ if ( targetTrainingCorpus . IsScripture ( ) )
194+ {
195+ // Filter out all non-scripture; we only train on scripture content
196+ targetTrainingCorpus = targetTrainingCorpus . Where ( IsScriptureRow ) ;
197+ }
165198
166- ITextCorpus [ ] targetTrainingCorpora = corpusBundle
167- . TargetTextCorpora . SelectMany ( c =>
168- c . TextCorpora . Select ( tc => FilterTrainingCorpora ( c . MonolingualCorpus , tc ) )
169- )
170- . ToArray ( ) ;
199+ // Align source and target training data
200+ ParallelTextRow [ ] trainingRows =
201+ [
202+ .. sourceTrainingCorpus . AlignRows ( targetTrainingCorpus , allSourceRows : true , allTargetRows : true ) ,
203+ ] ;
171204
172- // To support mixed source, collapse multiple source text corpora into one text corpus
173- // by randomly interlacing content from each of the source text corpora
174- ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora . ChooseRandom ( Seed ) ;
175- if ( sourceTrainingCorpus . IsScripture ( ) )
176- {
177- // Filter out all non-scripture; we only train on scripture content
178- sourceTrainingCorpus = sourceTrainingCorpus . Where ( IsScriptureRow ) ;
205+ // After merging segments across ranges, run the 'train' preprocessing function
206+ // on each training row and record whether any parallel training data was present
207+ foreach ( Row row in CollapseRanges ( trainingRows ) )
208+ {
209+ await train ( row , TrainingDataType . Text ) ;
210+ if ( ! parallelTrainingDataPresent && row . SourceSegment . Length > 0 && row . TargetSegment . Length > 0 )
211+ {
212+ parallelTrainingDataPresent = true ;
213+ }
214+ }
179215 }
180216
181- // Instead of interlacing rows from the target text corpora randomly, just take the
182- // text row from the first target text corpus that has content for that row
183- ITextCorpus targetTrainingCorpus = targetTrainingCorpora . ChooseFirst ( ) ;
184- if ( targetTrainingCorpus . IsScripture ( ) )
217+ // Get the target corpus from the training corpora.
218+ // This is across Scripture and non-Scripture corpora
219+ ITextCorpus [ ] targetCorpora =
220+ [
221+ .. corpusBundle . TargetTextCorpora . SelectMany ( c =>
222+ c . TextCorpora . Select ( tc => FilterTrainingCorpora ( c . MonolingualCorpus , tc ) )
223+ ) ,
224+ ] ;
225+ ITextCorpus targetCorpus = targetCorpora . ChooseFirst ( ) ;
226+ if ( targetCorpus . IsScripture ( ) )
185227 {
186228 // Filter out all non-scripture; we only train on scripture content
187- targetTrainingCorpus = targetTrainingCorpus . Where ( IsScriptureRow ) ;
188- }
189-
190- // Align source and target training data
191- ParallelTextRow [ ] trainingRows = sourceTrainingCorpus
192- . AlignRows ( targetTrainingCorpus , allSourceRows : true , allTargetRows : true )
193- . ToArray ( ) ;
194-
195- // After merging segments across ranges, run the 'train' preprocessing function
196- // on each training row and record whether any parallel training data was present
197- foreach ( Row row in CollapseRanges ( trainingRows ) )
198- {
199- await train ( row , TrainingDataType . Text ) ;
200- if ( ! parallelTrainingDataPresent && row . SourceSegment . Length > 0 && row . TargetSegment . Length > 0 )
201- {
202- parallelTrainingDataPresent = true ;
203- }
229+ targetCorpus = targetCorpus . Where ( IsScriptureRow ) ;
204230 }
205231
206232 if ( useKeyTerms )
@@ -255,7 +281,7 @@ public async Task PreprocessAsync(
255281 {
256282 sourceInferencingCorpus ,
257283 targetInferencingCorpus ,
258- targetTrainingCorpus ,
284+ targetCorpus ,
259285 } . AlignMany ( [ true , false , false ] ) ;
260286
261287 foreach ( ( Row row , bool isInTrainingData ) in CollapseInferencingRanges ( inferencingCorpus . ToArray ( ) ) )
0 commit comments