Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Amdework
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Python code for preprocessing Ge'ez characters
## Printing Unicode characters with their codes
###unicode_string = "\u2665\u00C4\u00C6" # output= ♥ÄÆ
# ----
##### This Python code removes punctuation, non-Ge'ez characters, and special characters ######
def normalizeString(s):
    """Strip punctuation and non-Ge'ez characters from ``s``.

    Only characters in U+1200-U+137C (Ethiopic syllables and numerals),
    whitespace, and decimal digits are kept. Ethiopic punctuation
    (U+1360-U+1368) lies inside that range, so it is removed explicitly
    before the final filter runs.

    No lowercasing or trimming is performed: leading/trailing whitespace and
    runs of inner whitespace are returned as they are. The original note on
    this helper credits PyTorch as its source.

    Args:
        s: Text to clean (``str``).

    Returns:
        The cleaned string.
    """
    # Function-scope import: the file has no module-level ``import re``.
    import re

    # Drop the one whitespace character directly before a full stop (።),
    # presumably so that removing the stop does not leave a doubled gap.
    s = re.sub(r"\s(?=።)", "", s)
    # Remove sentence punctuation. The whole Ethiopic punctuation run
    # U+1360-U+1368 (፠ ፡ ። ፣ ፤ ፥ ፦ ፧ ፨) is listed, not just a subset,
    # because the keep-range below would otherwise let the rest through.
    s = re.sub(r"[.!?\u1360-\u1368]", "", s)
    # Remove everything that is not Ethiopic, whitespace, or a digit.
    # NOTE: for str patterns ``\d`` matches any Unicode decimal digit, not
    # only ASCII 0-9.
    s = re.sub(r"[^\u1200-\u137c\s\d]", "", s)
    return s

#unicode_string = "\u1200-\u137c"
#print(unicode_string)
# Demo: run normalizeString over sample sentences that mix Ge'ez text,
# Ethiopic punctuation and numerals, Latin letters, and ASCII digits.
# Plain string literals are used because nothing is interpolated.
print(normalizeString("the quotation ወገብረ እግዚአብሔር ለአዳም ወለብእሲቱ አዕዳለ ዘማእስ ወአልበሶሙ from bible፻፲፱ 119። "))
print(normalizeString(" ። ከእርሱም ፈቀቅ አለ። የዚያን ጊዜ። ስለI'm !!To 2334 he's እግዚአብሔርም ። you re avoid!!! t!!! his error, make sure your .tloook.: ."))
print(normalizeString("እ1234ግዚአብሔርም ። ብ!!ርሃን ይሁን፣፤፥፡ ኣ?ለ፤ ብርሃን456ም!! ሆነ ። ወኮነ ብርሃን ። እንt56ደ ሆነ፣፤፥፡ አየ፤ እግዚብሔርም ብርሃንንና ወማ ። "))
print(normalizeString(" ወማ ። ፻፲፱ ። መላጣI'm !!To 2334 he's እግዚአብሔርም ። you re avoid!!! t!!! his error, make sure your .tloook.: ."))