Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 1 addition & 12 deletions src/Base/CasedTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,18 +1,7 @@
using BERTTokenizers.Extensions;
using System;
using System.Collections.Generic;
using System.Linq;

namespace BERTTokenizers.Base
namespace BERTTokenizers.Base
{
/// <summary>
/// Tokenizer base that splits text into tokens without changing their casing.
/// </summary>
public abstract class CasedTokenizer : TokenizerBase
{
    /// <summary>
    /// Forwards the vocabulary file path to the <see cref="TokenizerBase"/> constructor.
    /// </summary>
    /// <param name="vocabularyFilePath">Path of the vocabulary file handed to the base class.</param>
    protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { }

    /// <summary>
    /// Splits <paramref name="text"/> on white space and then passes every word through
    /// <c>SplitAndKeep</c> with the punctuation set below. Casing is left untouched.
    /// </summary>
    /// <param name="text">The raw text to split.</param>
    /// <returns>A lazily evaluated sequence of the pieces produced for each word.</returns>
    protected override IEnumerable<string> TokenizeSentence(string text)
    {
        // An empty separator array makes string.Split break on every white-space
        // character, so "\n" and tabs separate words just like " " and "\r\n" do.
        // RemoveEmptyEntries drops the empty strings that runs of white space
        // would otherwise yield.
        var words = text.Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries);

        // Built once per call instead of once per word.
        var punctuation = ".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray();

        return words.SelectMany(word => word.SplitAndKeep(punctuation));
    }
}
}
11 changes: 8 additions & 3 deletions src/Base/TokenizerBase.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using BERTTokenizers.Helpers;
using BERTTokenizers.Extensions;
using BERTTokenizers.Helpers;
using System;
using System.Collections.Generic;
using System.Linq;
Expand Down Expand Up @@ -87,6 +88,12 @@ public List<string> Untokenize(List<string> tokens)
=> (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList();
}

/// <summary>
/// Default splitting used by tokenizers: breaks <paramref name="text"/> on white space,
/// discards empty entries, and passes every word through <c>SplitAndKeep</c> with the
/// punctuation set below.
/// </summary>
/// <param name="text">The raw text to split.</param>
/// <returns>A lazily evaluated sequence of the pieces produced for each word.</returns>
protected virtual IEnumerable<string> TokenizeSentence(string text)
{
    // An empty separator array tells string.Split to use white-space characters as delimiters.
    var words = text.Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries);

    return from word in words
           from piece in word.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray())
           select piece;
}

private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)
{
var segmentIndex = 0;
Expand Down Expand Up @@ -152,7 +159,5 @@ private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)

return tokens;
}

protected abstract IEnumerable<string> TokenizeSentence(string text);
}
}
8 changes: 2 additions & 6 deletions src/Base/UncasedTokenizer.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using BERTTokenizers.Extensions;
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;

namespace BERTTokenizers.Base
Expand All @@ -13,9 +11,7 @@ protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath)

/// <summary>
/// Splits <paramref name="text"/> with the base-class implementation and lower-cases
/// every resulting token.
/// </summary>
/// <param name="text">The raw text to split.</param>
/// <returns>A lazily evaluated sequence of lower-cased tokens.</returns>
protected override IEnumerable<string> TokenizeSentence(string text)
{
    // Splitting is delegated to the base class. The inline copy that used to
    // return first only recognised " " and "\r\n" as separators and made this
    // call unreachable.
    // ToLowerInvariant keeps the output independent of the current culture
    // (for example, tr-TR lower-cases "I" to a dotless "ı").
    return base.TokenizeSentence(text).Select(token => token.ToLowerInvariant());
}
}
}
2 changes: 1 addition & 1 deletion tests/BERTTokenizers.Tests.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net7.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
Expand Down
15 changes: 14 additions & 1 deletion tests/BertBaseTokenizerUncasedShould.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,20 @@ public void Tokenize_sentence()
Assert.Equal(("love", 2293, 0), tokens[2]);
Assert.Equal(("you", 2017, 0), tokens[3]);
Assert.Equal(("[SEP]", 102, 0), tokens[4]);
}

[Fact]
public void Tokenize_text_with_linux_line_endings()
{
    // Arrange: three words separated by bare "\n" characters (no "\r").
    const string input = "Linux\nline\nendings";

    // Act
    var result = _tokenizer.Tokenize(input);

    // Assert: one entry per word, wrapped in the [CLS] and [SEP] markers.
    Assert.Equal(5, result.Count);
    Assert.Equal(("[CLS]", 101, 0), result[0]);
    Assert.Equal(("linux", 11603, 0), result[1]);
    Assert.Equal(("line", 2240, 0), result[2]);
    Assert.Equal(("endings", 21306, 0), result[3]);
    Assert.Equal(("[SEP]", 102, 0), result[4]);
}

[Fact]
Expand Down Expand Up @@ -61,7 +74,7 @@ public void Encode_sentence()
}

[Fact]
public void Unokenize_sentence()
public void Untokenize_sentence()
{
var tokens = new List<string>(){ "she", "##s" };

Expand Down