Skip to content

Commit cab499a

Browse files
committed
Handle USFM and Text corpora separately in pre-processing
1 parent c57e306 commit cab499a

1 file changed

Lines changed: 68 additions & 42 deletions

File tree

src/ServiceToolkit/src/SIL.ServiceToolkit/Services/ParallelCorpusService.cs

Lines changed: 68 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -153,54 +153,80 @@ public async Task PreprocessAsync(
153153
ignoreUsfmMarkers ??= [];
154154

155155
bool parallelTrainingDataPresent = false;
156-
List<Row> keyTermTrainingData = new();
156+
List<Row> keyTermTrainingData = [];
157+
158+
// Iterate over USFM and Text training corpora separately.
159+
// This is not only because they use different keys, but also because, when text corpora
160+
// are combined with scripture corpora, we don't want to exclude the text corpora from training.
161+
foreach (bool isScripture in new bool[] { true, false })
162+
{
163+
// Create source and target arrays of text corpora filtered for training
164+
// based on the filters specified in the associated monolingual corpora
165+
ITextCorpus[] sourceTrainingCorpora =
166+
[
167+
.. corpusBundle.SourceTextCorpora.SelectMany(c =>
168+
c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture())
169+
.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
170+
),
171+
];
172+
173+
ITextCorpus[] targetTrainingCorpora =
174+
[
175+
.. corpusBundle.TargetTextCorpora.SelectMany(c =>
176+
c.TextCorpora.Where(tc => isScripture ? tc.IsScripture() : !tc.IsScripture())
177+
.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
178+
),
179+
];
180+
181+
// To support mixed source, collapse multiple source text corpora into one text corpus
182+
// by randomly interlacing content from each of the source text corpora
183+
ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed);
184+
if (sourceTrainingCorpus.IsScripture())
185+
{
186+
// Filter out all non-scripture; we only train on scripture content
187+
sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow);
188+
}
157189

158-
// Create source and target arrays of text corpora filtered for training
159-
// based on the filters specified in the associated monolingual corpora
160-
ITextCorpus[] sourceTrainingCorpora = corpusBundle
161-
.SourceTextCorpora.SelectMany(c =>
162-
c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
163-
)
164-
.ToArray();
190+
// Instead of interlacing rows from the target text corpora randomly, just take the
191+
// text row from the first target text corpus that has content for that row
192+
ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst();
193+
if (targetTrainingCorpus.IsScripture())
194+
{
195+
// Filter out all non-scripture; we only train on scripture content
196+
targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow);
197+
}
165198

166-
ITextCorpus[] targetTrainingCorpora = corpusBundle
167-
.TargetTextCorpora.SelectMany(c =>
168-
c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
169-
)
170-
.ToArray();
199+
// Align source and target training data
200+
ParallelTextRow[] trainingRows =
201+
[
202+
.. sourceTrainingCorpus.AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true),
203+
];
171204

172-
// To support mixed source, collapse multiple source text corpora into one text corpus
173-
// by randomly interlacing content from each of the source text corpora
174-
ITextCorpus sourceTrainingCorpus = sourceTrainingCorpora.ChooseRandom(Seed);
175-
if (sourceTrainingCorpus.IsScripture())
176-
{
177-
// Filter out all non-scripture; we only train on scripture content
178-
sourceTrainingCorpus = sourceTrainingCorpus.Where(IsScriptureRow);
205+
// After merging segments across ranges, run the 'train' preprocessing function
206+
// on each training row and record whether any parallel training data was present
207+
foreach (Row row in CollapseRanges(trainingRows))
208+
{
209+
await train(row, TrainingDataType.Text);
210+
if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
211+
{
212+
parallelTrainingDataPresent = true;
213+
}
214+
}
179215
}
180216

181-
// Instead of interlacing rows from the target text corpora randomly, just take the
182-
// text row from the first target text corpus that has content for that row
183-
ITextCorpus targetTrainingCorpus = targetTrainingCorpora.ChooseFirst();
184-
if (targetTrainingCorpus.IsScripture())
217+
// Get the target corpus from the training corpora.
218+
// This is across Scripture and non-Scripture corpora
219+
ITextCorpus[] targetCorpora =
220+
[
221+
.. corpusBundle.TargetTextCorpora.SelectMany(c =>
222+
c.TextCorpora.Select(tc => FilterTrainingCorpora(c.MonolingualCorpus, tc))
223+
),
224+
];
225+
ITextCorpus targetCorpus = targetCorpora.ChooseFirst();
226+
if (targetCorpus.IsScripture())
185227
{
186228
// Filter out all non-scripture; we only train on scripture content
187-
targetTrainingCorpus = targetTrainingCorpus.Where(IsScriptureRow);
188-
}
189-
190-
// Align source and target training data
191-
ParallelTextRow[] trainingRows = sourceTrainingCorpus
192-
.AlignRows(targetTrainingCorpus, allSourceRows: true, allTargetRows: true)
193-
.ToArray();
194-
195-
// After merging segments across ranges, run the 'train' preprocessing function
196-
// on each training row and record whether any parallel training data was present
197-
foreach (Row row in CollapseRanges(trainingRows))
198-
{
199-
await train(row, TrainingDataType.Text);
200-
if (!parallelTrainingDataPresent && row.SourceSegment.Length > 0 && row.TargetSegment.Length > 0)
201-
{
202-
parallelTrainingDataPresent = true;
203-
}
229+
targetCorpus = targetCorpus.Where(IsScriptureRow);
204230
}
205231

206232
if (useKeyTerms)
@@ -255,7 +281,7 @@ public async Task PreprocessAsync(
255281
{
256282
sourceInferencingCorpus,
257283
targetInferencingCorpus,
258-
targetTrainingCorpus,
284+
targetCorpus,
259285
}.AlignMany([true, false, false]);
260286

261287
foreach ((Row row, bool isInTrainingData) in CollapseInferencingRanges(inferencingCorpus.ToArray()))

0 commit comments

Comments
 (0)