Files: prepare.py

File prepare.py, 1.9 kB (added by Blackhex, 8 months ago)

Merges OPUS and AC corpora.

Line 
#!/usr/bin/env python
"""Merge the OPUS and AC corpora into a single Czech-English corpus.

For each language side (``cze`` and ``eng``) the script reads
``cze-eng-opus.<lang>`` and ``cze-eng-ac.<lang>`` from the current
directory and writes ``cze-eng.<lang>``.

Each input is cut at 90% and 95% of its line count, giving three
consecutive slices.  The output holds the first OPUS slice, the first AC
slice, the second OPUS slice, the second AC slice, and so on, so every
input line appears exactly once.
NOTE(review): this looks like a 90/5/5 train/dev/test layout -- confirm
with whatever consumes the merged files.
"""

import sys

# Cut points, expressed as fractions of a corpus' line count.
SPLIT_FRACTIONS = (0.9, 0.95)


def _read_lines(path):
    """Return the lines of *path* as byte strings, each newline-terminated.

    The file is read in binary mode so the bytes are copied verbatim,
    independent of the interpreter's default text encoding.
    """
    with open(path, 'rb') as handle:
        lines = handle.readlines()
    # Only the final line can lack a terminator.  Without this fix it would
    # be fused with the first line of the slice written after it, shifting
    # the merged corpus by one line.
    if lines and not lines[-1].endswith(b'\n'):
        lines[-1] += b'\n'
    return lines


def _segments(lines):
    """Split *lines* into consecutive slices at the SPLIT_FRACTIONS cuts."""
    count = len(lines)
    bounds = [0] + [int(fraction * count) for fraction in SPLIT_FRACTIONS]
    bounds.append(count)
    return [lines[start:stop] for start, stop in zip(bounds, bounds[1:])]


def merge_corpora(opus_path, ac_path, merged_path):
    """Interleave the slices of two corpora files into *merged_path*.

    Args:
        opus_path: path of the OPUS corpus file for one language.
        ac_path: path of the AC corpus file for the same language.
        merged_path: path of the output file; it is overwritten.

    Returns:
        A ``(opus_line_count, ac_line_count)`` tuple.

    Raises:
        IOError/OSError: if an input cannot be read or the output cannot
            be written.
    """
    opus_lines = _read_lines(opus_path)
    ac_lines = _read_lines(ac_path)
    with open(merged_path, 'wb') as merged:
        for opus_part, ac_part in zip(_segments(opus_lines),
                                      _segments(ac_lines)):
            merged.writelines(opus_part)
            merged.writelines(ac_part)
    return len(opus_lines), len(ac_lines)


def main():
    """Merge the Czech side, report its line counts, then merge English."""
    cze_counts = merge_corpora('cze-eng-opus.cze', 'cze-eng-ac.cze',
                               'cze-eng.cze')
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(cze_counts[0])
    print(cze_counts[1])

    eng_counts = merge_corpora('cze-eng-opus.eng', 'cze-eng-ac.eng',
                               'cze-eng.eng')
    # Cut points are derived from each file's own length, so differing
    # lengths place the cuts at different lines in the two outputs.
    if eng_counts != cze_counts:
        sys.stderr.write(
            'warning: line counts differ between languages '
            '(cze: opus=%d ac=%d; eng: opus=%d ac=%d); '
            'merged files may be misaligned\n'
            % (cze_counts + eng_counts))


if __name__ == '__main__':
    main()