diff --git a/.gitignore b/.gitignore
index 000091a..1f47ff4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,99 @@
 data/
 *.pyc
 _*
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+envbots/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+#IDE
+.idea
+__pycache__
\ No newline at end of file
diff --git a/README.md b/README.md
index 81a68ec..1ce6015 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,8 @@ This project has two purposes. First of all, I'd like to share some of my experi
 * Go get various English word vectors [here](https://github.com/3Top/word2vec-api) if needed.
 
 ## Work Flow
-* STEP 1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want.
+* STEP 1-1. Download the [wikipedia database backup dumps](https://dumps.wikimedia.org/backup-index.html) of the language you want.
+* STEP 1-2. Install the required packages from `requirements.txt`.
 * STEP 2. Extract running texts to `data/` folder.
 * STEP 3. Run `build_corpus.py`.
 * STEP 4-1. Run `make_wordvector.sh` to get Word2Vec word vectors.
diff --git a/build_corpus.py b/build_corpus.py
index 0bf03e1..0033f91 100644
--- a/build_corpus.py
+++ b/build_corpus.py
@@ -1,5 +1,4 @@
 # coding: utf-8
-#!/usr/bin/python2
 import argparse
 import codecs
 import lxml.etree as ET
@@ -10,26 +9,29 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--lcode', help='ISO 639-1 code of target language. See `lcodes.txt`.')
 parser.add_argument('--max_corpus_size', type=int, default=1000000000, help='the maximum size of the corpus. Feel free to adjust it according to your computing power.')
+parser.add_argument('--wiki_dump_version', help='version (dump date) of the wikimedia dump, e.g. 20180120')
+
 args = parser.parse_args()
 
 lcode = args.lcode
+wiki_dump_version = args.wiki_dump_version
 if lcode == 'ko':
     from konlpy.tag import Kkma # pip install konlpy. See http://konlpy.org/en/v0.4.4/ for further information.
     kkma = Kkma()
-    print "kkma succesfuly loaded!"
+    print("kkma successfully loaded!")
 elif lcode == 'ja':
     import MeCab # See https://pypi.python.org/pypi/mecab-python/0.996
     mecab = MeCab.Tagger("-Owakati")
-    print "mecab succesfuly loaded!"
+    print("mecab successfully loaded!")
 elif lcode == 'zh':
     import jieba # See https://pypi.python.org/pypi/jieba/
-    print "jieba succesfuly loaded!"
+    print("jieba successfully loaded!")
 elif lcode == 'vi':
     from pyvi.pyvi import ViTokenizer # See https://pypi.python.org/pypi/pyvi
-    print "pyvi succesfuly loaded!"
+    print("pyvi successfully loaded!")
 elif lcode == 'th':
     import pythai # See https://pypi.python.org/pypi/pythai
-    print "pythai succesfuly loaded!"
+    print("pythai successfully loaded!")
 # elif lcode == 'ar':
 #     os.environ['CLASSPATH'] = "../stanford-segmenter-2015-12-09"
 #     from nltk.tokenize.stanford_segmenter import StanfordSegmenter
@@ -37,10 +39,10 @@
 #                                path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
 #                                path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
 #                                path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
-#     print "StanfordSegmenter succesfuly loaded!"
+#     print("StanfordSegmenter successfully loaded!")
 
 max_corpus_size = args.max_corpus_size
-fname = "{}wiki-20161201-pages-articles-multistream.xml".format(lcode)
+fname = "{}wiki-{}-pages-articles-multistream.xml".format(lcode, wiki_dump_version)
 
 def clean_text(text):
     global lcode
@@ -157,7 +159,7 @@ def build_corpus():
                 continue # it's okay as we have a pretty big corpus!
             elem.clear() # We need to save memory!
             if i % 1000 == 0:
-                print i,
+                print(i, end=' ')
                 fsize = os.path.getsize("data/{}.txt".format(lcode))
                 if fsize > max_corpus_size:
                     break
@@ -166,4 +168,4 @@
 
 if __name__ == "__main__":
     build_corpus()
-    print "Done"
+    print("Done")
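For reviewers: the new `--wiki_dump_version` flag simply threads the dump date into the expected XML filename. The sketch below is not part of the diff; it mirrors the format string above, and the `id`/`20180120` defaults are only the example values used in `make_wordvectors.sh`, not requirements.

```python
# Minimal sketch of how build_corpus.py now derives the dump filename.
# The default values are examples taken from make_wordvectors.sh, not requirements.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lcode', default='id')                    # ISO 639-1 language code
parser.add_argument('--wiki_dump_version', default='20180120')  # dump date from dumps.wikimedia.org
args = parser.parse_args()

# Same format string as in build_corpus.py above:
fname = "{}wiki-{}-pages-articles-multistream.xml".format(args.lcode, args.wiki_dump_version)
print(fname)  # idwiki-20180120-pages-articles-multistream.xml
```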
diff --git a/make_wordvectors.py b/make_wordvectors.py
index 31c7cb6..6f41fa1 100644
--- a/make_wordvectors.py
+++ b/make_wordvectors.py
@@ -1,10 +1,9 @@
 # coding: utf-8
-#!/usr/bin/python2
 import nltk
-import os
 import codecs
 import argparse
 import numpy as np
+import sys
 
 # arguments setting
 parser = argparse.ArgumentParser()
@@ -40,9 +39,13 @@ def get_min_count(sents):
 def make_wordvectors():
     global lcode
     import gensim # In case you have difficulties installing gensim, you need to consider installing conda.
-    import cPickle as pickle
+
+    if sys.version_info[0] >= 3:
+        import pickle
+    else:
+        import cPickle as pickle
 
-    print "Making sentences as list..."
+    print("Making sentences as list...")
     sents = []
     with codecs.open('data/{}.txt'.format(lcode), 'r', 'utf-8') as fin:
         while 1:
@@ -52,7 +55,7 @@ def make_wordvectors():
             words = line.split()
             sents.append(words)
 
-    print "Making word vectors..."
+    print("Making word vectors...")
     min_count = get_min_count(sents)
 
     model = gensim.models.Word2Vec(sents, size=vector_size, min_count=min_count, negative=num_negative,
@@ -62,11 +65,11 @@
 
     # Save to file
     with codecs.open('data/{}.tsv'.format(lcode), 'w', 'utf-8') as fout:
-        for i, word in enumerate(model.index2word):
+        for i, word in enumerate(model.wv.index2word):
             fout.write(u"{}\t{}\t{}\n".format(str(i), word.encode('utf8').decode('utf8'),
                                               np.array_str(model[word])
                                               ))
 
 if __name__ == "__main__":
     make_wordvectors()
-    print "Done"
+    print("Done")
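The switch to `model.wv.index2word` follows the gensim >= 1.0 convention of reaching vocabulary and vectors through the `wv` attribute; the unchanged `model[word]` context line still works under the pinned gensim 3.2.0 but is deprecated. Below is a toy illustration of the newer access pattern, not code from this repository; the sentences and hyper-parameters are placeholders chosen only so the snippet runs quickly.

```python
# Toy example of the gensim `wv` access pattern this change moves toward.
# Corpus and parameters are placeholders, not the project's defaults.
import gensim

sents = [["king", "queen", "man", "woman"], ["seoul", "tokyo", "jakarta"]]
model = gensim.models.Word2Vec(sents, size=50, window=2, min_count=1, negative=5)

for i, word in enumerate(model.wv.index2word):  # vocabulary, most frequent first
    vec = model.wv[word]                        # preferred over the deprecated model[word]
    print(i, word, vec[:3])
```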
diff --git a/make_wordvectors.sh b/make_wordvectors.sh
old mode 100644
new mode 100755
index 56b1ebd..301b9f3
--- a/make_wordvectors.sh
+++ b/make_wordvectors.sh
@@ -2,26 +2,31 @@
 #### Set your hyper-parameters here ####
 ############## START ###################
-lcode="xx" # ISO 639-1 code of target language. See `lcodes.txt`.
-max_corpus_size=1000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
+lcode="id" # ISO 639-1 code of target language. See `lcodes.txt`.
+wiki_dump_version="20180120" # version (dump date) of the wikimedia dumps
+max_corpus_size=10000000000 # the maximum size of the corpus. Feel free to adjust it according to your computing power.
 vector_size=300 # the size of a word vector
 window_size=5 # the maximum distance between the current and predicted word within a sentence.
 vocab_size=20000 # the maximum vocabulary size
 num_negative=5 # the int for negative specifies how many “noise words” should be drawn
 ############## END #####################
 
-echo "step 0. Make `data` directory and move there.`
+echo "step 0-1. Install packages according to requirements.txt"
+pip install -r requirements.txt
+
+echo 'step 0-2. Make `data` directory and move there.'
 mkdir data; cd data
 
 echo "step 1. Download the stored wikipedia file to your disk."
-wget "https://dumps.wikimedia.org/${lcode}wiki/20161201/${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
+rm -rf ${lcode}wiki-${wiki_dump_version}-pages-articles-multistream*
+wget "https://dumps.wikimedia.org/${lcode}wiki/${wiki_dump_version}/${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"
 
 echo "step 2. Extract the bz2 file."
-bzip2 -d "${lcode}wiki-20161201-pages-articles-multistream.xml.bz2"
+bzip2 -d "${lcode}wiki-${wiki_dump_version}-pages-articles-multistream.xml.bz2"
 
 cd ..
 
 echo "step 3. Build Corpus."
-python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size}
+python build_corpus.py --lcode=${lcode} --max_corpus_size=${max_corpus_size} --wiki_dump_version=${wiki_dump_version}
 
 echo "step 4. make wordvectors"
 python make_wordvectors.py --lcode=${lcode} --vector_size=${vector_size} --window_size=${window_size} --vocab_size=${vocab_size} --num_negative=${num_negative}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..37d5493
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,28 @@
+boto==2.48.0
+boto3==1.5.20
+botocore==1.8.34
+bz2file==0.98
+certifi==2018.1.18
+chardet==3.0.4
+docutils==0.14
+gensim==3.2.0
+idna==2.6
+jmespath==0.9.3
+konlpy==0.4.4
+lxml==4.1.1
+nltk==3.2.5
+numpy==1.14.0
+python-crfsuite==0.9.5
+python-dateutil==2.6.1
+pyvi==0.0.8.0
+regex==2018.1.10
+requests==2.18.4
+s3transfer==0.1.12
+scikit-learn==0.19.1
+scipy==1.0.0
+six==1.11.0
+sklearn-crfsuite==0.3.6
+smart-open==1.5.6
+tabulate==0.8.2
+tqdm==4.19.5
+urllib3==1.22
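After `pip install -r requirements.txt`, a quick sanity check (only a suggestion, not part of the diff) is to confirm that the pinned versions are the ones actually importable before running the pipeline:

```python
# Optional check that key pinned packages resolve to the versions in requirements.txt.
import gensim
import nltk
import numpy

print(gensim.__version__)  # expected: 3.2.0
print(nltk.__version__)    # expected: 3.2.5
print(numpy.__version__)   # expected: 1.14.0
```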