| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
import os, sys, re |
|---|
| 4 |
|
|---|
# Pipeline steps to execute.  Every step (1 through 8) runs by default; when
# the script is started with exactly one command-line argument, that argument
# selects the steps instead, e.g. "[1, 2, 3]" or "range(4, 9)".
steps = range(1, 9)
if __name__ == '__main__' and len(sys.argv) == 2:
    # SECURITY NOTE(review): eval() executes the argument as arbitrary Python
    # code.  It is kept because expressions such as "range(4, 9)" are accepted
    # today; never pass input from an untrusted source to this script.
    steps = eval(sys.argv[1])

# A language pair is written "<first>-<second>" (e.g. "cze-eng"); the two
# groups capture the word characters on either side of the hyphen.  The
# pattern is a raw string so that "\w" reaches the regex engine untouched.
lang_re = re.compile(r'^(\w*)-(\w*)$')

# Installation root; the 'bin' and 'opus' directories are looked up below it.
prefix = '/pub/tmp/xbarto33'
bin_prefix = os.path.join(prefix, 'bin')
corpus_prefix = os.path.join(prefix, 'opus')

# Language pairs handled by the main loop.
languages = ['cze-eng']
| 15 |
|
|---|
def tokenize(file, language='en'):
    """Tokenize `file` in place with tokenizer.perl.

    The tokenizer found under `bin_prefix` reads `file` on stdin and writes
    to a scratch file named 'tmp' in the current directory; when it succeeds,
    'tmp' is moved over `file`.  Both shell commands are echoed to stdout
    before they run.

    Args:
        file: path of the text file to tokenize; it is overwritten on success.
        language: value passed to the tokenizer's ``-l`` option.  Defaults to
            'en', which is what every call used before this parameter existed.

    Returns:
        None.
    """
    tokenizer = os.path.join(bin_prefix, 'tokenizer.perl')
    command = '%s -l %s < %s > tmp' % (tokenizer, language, file)
    print(command)
    status = os.system(command)
    if status != 0:
        # The shell creates 'tmp' for the redirection even when the tokenizer
        # itself cannot be started, so moving it over `file` after a failure
        # would replace the input with empty or partial output.
        sys.stderr.write('tokenize: command failed (status %s); %s left '
                         'unchanged\n' % (status, file))
        return
    command = 'mv tmp %s' % (file,)
    print(command)
    os.system(command)
| 24 |
|
|---|
def detokenize(file, language='en'):
    """Detokenize `file` in place with detokenizer.perl.

    The detokenizer found under `bin_prefix` reads `file` on stdin and writes
    to a scratch file named 'tmp' in the current directory; when it succeeds,
    'tmp' is moved over `file`.  Both shell commands are echoed to stdout
    before they run.

    Args:
        file: path of the text file to detokenize; it is overwritten on
            success.
        language: value passed to the script's ``-l`` option.  Defaults to
            'en', the value that was previously hard-coded.

    Returns:
        None.
    """
    # This used to invoke 'tokenizer.perl', i.e. it tokenized the file a
    # second time instead of undoing the tokenization.
    # NOTE(review): assumes detokenizer.perl is installed next to
    # tokenizer.perl under bin_prefix -- confirm on the target machine.
    detokenizer = os.path.join(bin_prefix, 'detokenizer.perl')
    command = '%s -l %s < %s > tmp' % (detokenizer, language, file)
    print(command)
    status = os.system(command)
    if status != 0:
        # The shell creates 'tmp' for the redirection even when the script
        # cannot be started, so moving it over `file` after a failure would
        # replace the input with empty or partial output.
        sys.stderr.write('detokenize: command failed (status %s); %s left '
                         'unchanged\n' % (status, file))
        return
    command = 'mv tmp %s' % (file,)
    print(command)
    os.system(command)
| 33 |
|
|---|
def clean(corpus, lang1, lang2):
    """Filter a parallel corpus with clean-corpus-n.perl, in place.

    The script under `bin_prefix` is called with the arguments
    ``corpus lang1 lang2 tmp 1 50``; afterwards ``tmp.<lang1>`` and
    ``tmp.<lang2>`` are moved over ``<corpus>.<lang1>`` and
    ``<corpus>.<lang2>``.  Every shell command is echoed to stdout before it
    is executed.

    Args:
        corpus: corpus path without the language extension.
        lang1: extension of the first side of the corpus.
        lang2: extension of the second side of the corpus.

    Returns:
        None.
    """
    cleaner = os.path.join(bin_prefix, 'clean-corpus-n.perl')
    # NOTE(review): 1 and 50 are presumably the minimum and maximum sentence
    # length kept by the script -- confirm against clean-corpus-n.perl.
    shell_lines = ['%s %s %s %s tmp 1 50' % (cleaner, corpus, lang1, lang2)]
    for side in (lang1, lang2):
        shell_lines.append('mv tmp.%s %s.%s' % (side, corpus, side))

    for shell_line in shell_lines:
        print(shell_line)
        os.system(shell_line)
| 45 |
|
|---|
def _run(command):
    """Echo a shell command to stdout, then execute it with os.system."""
    print(command)
    os.system(command)


def _copy_lines(source, target, count):
    """Copy the next `count` lines from `source` to `target`.

    readline() returns an empty string at end of file, so nothing more is
    written once `source` is exhausted.
    """
    remaining = count
    while remaining > 0:
        target.write(source.readline())
        remaining -= 1


def _split_corpus(lang, lang1, lang2):
    """Split <lang>.<lang1> and <lang>.<lang2> into train/devel/test files.

    The split points come from the line count of the lang1 file only: the
    first 90% of the lines go to <lang>-train.*, the next 5% to
    <lang>-devel.* and the rest to <lang>-test.*.  The same line ranges are
    taken from both input files.
    """
    first_path = '%s.%s' % (lang, lang1)
    second_path = '%s.%s' % (lang, lang2)

    # Count lines by streaming rather than loading the whole file.
    with open(first_path, 'r') as counted:
        line_count = sum(1 for _ in counted)

    devel_index = int(0.90 * line_count)
    test_index = int(0.95 * line_count)
    parts = (
        ('train', devel_index),
        ('devel', test_index - devel_index),
        ('test', line_count - test_index),
    )

    # Both inputs are opened before any output is created, so a missing
    # input aborts the split without leaving partial output files behind.
    with open(first_path, 'r') as first_source:
        with open(second_path, 'r') as second_source:
            sources = ((lang1, first_source), (lang2, second_source))
            for part, size in parts:
                for side, source in sources:
                    out_path = '%s-%s.%s' % (lang, part, side)
                    with open(out_path, 'w') as target:
                        _copy_lines(source, target, size)


# Run the selected pipeline steps for every configured language pair.
for lang in languages:
    match = lang_re.match(lang)
    if not match:
        # Without a match lang1/lang2 would be undefined (or left over from
        # the previous pair), so the entry is skipped instead.
        sys.stderr.write('skipping %r: not of the form "<lang1>-<lang2>"\n'
                         % (lang,))
        continue
    lang1, lang2 = match.groups()

    moses = os.path.join(bin_prefix, 'moses')
    working_dir = os.path.join(corpus_prefix, 'working-dir')
    tuning_dir = os.path.join(working_dir, 'tuning')
    model_ini = os.path.join(working_dir, 'model', 'moses.ini')
    tuned_ini = os.path.join(working_dir, 'model', 'moses-tuned.ini')
    parts = ('train', 'devel', 'test')

    # Step 1: split the corpus into train/devel/test portions.
    if 1 in steps:
        _split_corpus(lang, lang1, lang2)

    # Step 2: tokenize both sides of every portion, in place.
    # NOTE(review): tokenize() is called without a language, so both sides
    # are processed with its default '-l en' setting -- confirm that this is
    # intended for the lang1 side.
    if 2 in steps:
        for part in parts:
            for side in (lang1, lang2):
                tokenize('%s-%s.%s' % (lang, part, side))

    # Step 3: run the corpus cleaner over every portion.
    if 3 in steps:
        for part in parts:
            clean('%s-%s' % (lang, part), lang1, lang2)

    # Step 4: run ngram-count (-order 5) on each side of the training data,
    # writing <lang>.<side>.lm into the current directory.
    if 4 in steps:
        ngram_count = os.path.join(bin_prefix, 'ngram-count')
        for side in (lang2, lang1):
            _run('%s -order 5 -interpolate -kndiscount -text %s-train.%s'
                 ' -lm %s.%s.lm' % (ngram_count, lang, side, lang, side))

    # Step 5: run the training script on the training portion.
    if 5 in steps:
        # The -lm argument is the absolute path of the <lang>.<lang2>.lm file
        # written by step 4 into the current directory.  The format string
        # consumes the path stem and the language as two separate values;
        # joining them into a single path component left the format string
        # one argument short and raised TypeError.
        _run('%s -scripts-root-dir %s -root-dir %s -corpus %s-train'
             ' -f %s -e %s -alignment grow-diag-final-and -reordering'
             ' msd-bidirectional-fe -lm 0:5:%s.%s.lm:0 -parallel' % (
                 os.path.join(bin_prefix, 'training',
                              'train-factored-phrase-model.perl'),
                 bin_prefix, working_dir, lang, lang1, lang2,
                 os.path.join(os.getcwd(), lang), lang2))

    # Step 6: run mert-moses.pl on the devel portion, then reuse-weights.perl
    # to write model/moses-tuned.ini from the tuning and model ini files.
    if 6 in steps:
        _run('%s %s-devel.%s %s-devel.%s %s %s --working-dir %s'
             ' --rootdir %s' % (
                 os.path.join(bin_prefix, 'training', 'mert-moses.pl'),
                 lang, lang1, lang, lang2, moses, model_ini, tuning_dir,
                 bin_prefix))
        _run('%s %s < %s > %s' % (
            os.path.join(bin_prefix, 'reuse-weights.perl'),
            os.path.join(tuning_dir, 'moses.ini'), model_ini, tuned_ini))

    # Step 7: translate the lang1 test portion with the untuned and the tuned
    # configuration.
    if 7 in steps:
        for label, ini in (('normal', model_ini), ('tuned', tuned_ini)):
            _run('%s -f %s -input-file %s-test.%s > %s-translated-%s.%s' % (
                moses, ini, lang, lang1, lang, label, lang2))

    # Step 8: run multi-bleu.perl with the lang2 test portion as its argument
    # and each translation on stdin.
    if 8 in steps:
        multi_bleu = os.path.join(bin_prefix, 'multi-bleu.perl')
        for label in ('normal', 'tuned'):
            _run('%s %s-test.%s < %s-translated-%s.%s' % (
                multi_bleu, lang, lang2, lang, label, lang2))
| 176 |
|
|---|