forked from radixvinni/sqlite-morph
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_from_opencorpora.py
148 lines (127 loc) · 4.87 KB
/
gen_from_opencorpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Import the OpenCorpora dictionary from XML into SQLite."""
from __future__ import unicode_literals
from parse import load_json_or_xml_dict
import os
# The only strings _to_paradigm accepts in front of the stem of a word form.
# If any form of a lemma would need a different prefix, _to_paradigm falls
# back to the common prefix of all forms as the stem and uses no prefixes.
LEMMA_PREFIXES = ["", "по", "наи"]
def _join_lemmas(lemmas, links):
    """
    Combine linked lemmas into single lemmas.

    ``lemmas`` maps a lemma id (as a string) to the list of entries of that
    lemma; ``links`` is an iterable of ``(link_start, link_end, type_id)``
    triples.  For every link of an allowed type, the entries of the
    ``link_end`` lemma are appended to the ``link_start`` lemma (or to the
    lemma ``link_start`` has itself already been merged into).

    The lists stored in ``lemmas`` are modified in place: the list of a
    merged lemma is emptied.  The return value is the list of non-empty
    entry lists, ordered by numeric lemma id.

    Links that mention an id missing from ``lemmas`` are ignored.  Links
    that would merge a lemma into itself (a self-link, or a cycle such as
    1 -> 2 followed by 2 -> 1) are ignored as well; merging them would
    erase the lemma and make later lookups loop forever.

    Link types of the dictionary (id, name, example)::

         1  ADJF-ADJS                        ясный - ясен
         2  ADJF-COMP                        ясный - яснее
         3  INFN-VERB                        учить - учил
         4  INFN-PRTF                        учить - учивший
         5  INFN-GRND                        учить - учив
         6  PRTF-PRTS
         7  NAME-PATR                        имя - отчество
         8  PATR_MASC-PATR_FEMN
         9  SURN_MASC-SURN_FEMN
        10  SURN_MASC-SURN_PLUR
        11  PERF-IMPF
        12  ADJF-SUPR_ejsh
        13  PATR_MASC_FORM-PATR_MASC_INFR
        14  PATR_FEMN_FORM-PATR_FEMN_INFR
        15  ADJF_eish-SUPR_nai_eish
        16  ADJF-SUPR_ajsh
        17  ADJF_aish-SUPR_nai_aish
        18  ADJF-SUPR_suppl
        19  ADJF-SUPR_nai
        20  ADJF-SUPR_slng
    """
    # Only links of these types are merged.  Every other type is skipped;
    # that includes 7 and 12, which used to be listed in a separate
    # exclusion set that the allow-list made redundant.
    ALLOWED_LINK_TYPES = frozenset(['1', '2', '3', '8', '9', '10'])

    # moves[a] == b means "lemma a has been merged into lemma b".
    moves = {}

    def resolve(lemma_id):
        """Follow recorded merges to the lemma that currently holds the data."""
        while lemma_id in moves:
            lemma_id = moves[lemma_id]
        return lemma_id

    def move_lemma(from_id, to_id):
        """Append the entries of ``from_id`` to ``to_id`` and empty ``from_id``."""
        if str(from_id) not in lemmas or str(to_id) not in lemmas:
            return

        to_id = resolve(to_id)
        if str(to_id) == str(from_id):
            # The target resolves back to the source.  Extending the list
            # with itself and clearing it would destroy the lemma, and
            # recording moves[x] = x would make resolve() spin forever.
            return

        source = lemmas[str(from_id)]
        lemmas[str(to_id)].extend(source)
        del source[:]
        moves[from_id] = to_id

    for link_start, link_end, type_id in links:
        if type_id in ALLOWED_LINK_TYPES:
            move_lemma(link_end, link_start)

    lemma_ids = sorted(lemmas, key=int)
    return [lemmas[lemma_id] for lemma_id in lemma_ids if lemmas[lemma_id]]
def longest_common_substring(data):
    """
    Find the longest string that occurs inside every item of ``data``.

    Candidates are slices of the first item.  When several candidates share
    the maximal length, the one that starts earliest in the first item is
    returned.  The result is an empty string when ``data`` has fewer than
    two items, when the first item is empty, or when the items share
    nothing::

        >>> longest_common_substring(["apricot", "rice", "cricket"])
        'ric'
        >>> longest_common_substring(["apricot", "banana"])
        'a'
        >>> longest_common_substring(["foo", "bar", "baz"])
        ''

    See http://stackoverflow.com/questions/2892931/.
    """
    if len(data) < 2:
        return ''

    reference = data[0]
    total = len(reference)

    # Try the longest candidates first; the first hit is the answer.
    for size in range(total, 0, -1):
        for start in range(total - size + 1):
            candidate = reference[start:start + size]
            if all(candidate in item for item in data):
                return candidate
    return ''
def _to_paradigm(lemma):
    """
    Split a lemma into a ``(stem, paradigm)`` pair.

    ``lemma`` is a non-empty sequence of ``(form, tag)`` pairs.  The
    paradigm is a tuple of ``(suffix, tag, prefix)`` triples, one per form,
    such that ``prefix + stem + suffix`` gives the form back.

    A single-form lemma uses the form itself as the stem.  Otherwise the
    stem is the longest substring common to all forms; if that leaves any
    form with a prefix that is not in LEMMA_PREFIXES, the stem becomes the
    common prefix of all forms and no prefixes are used.
    """
    forms, tags = list(zip(*lemma))
    no_prefixes = [''] * len(tags)

    if len(forms) > 1:
        stem = longest_common_substring(forms)
        prefixes = [form[:form.index(stem)] for form in forms]
        if not all(prefix in LEMMA_PREFIXES for prefix in prefixes):
            stem = os.path.commonprefix(forms)
            prefixes = no_prefixes
    else:
        stem = forms[0]
        prefixes = no_prefixes

    paradigm = tuple(
        (form[len(prefix) + len(stem):], tag, prefix)
        for form, tag, prefix in zip(forms, tags, prefixes)
    )
    return stem, paradigm
# ---------------------------------------------------------------------------
# Script body: read the OpenCorpora dictionary, reduce every lemma to a
# (stem, paradigm) pair and store the result in 'opencorpora.sqlite'.
#
# LEMMA_PREFIXES is defined once, next to the imports at the top of the
# file; the identical second definition that used to sit here was removed.
# ---------------------------------------------------------------------------
import sqlite3

# print() is always called with one pre-formatted string, so the output is
# the same under Python 2 (print statement) and Python 3 (print function).
print('opening dict...')
parsed_dict = load_json_or_xml_dict('dict.opcorpora.xml')
print('lemmas:  %d' % len(parsed_dict.lemmas))
lemmas = _join_lemmas(parsed_dict.lemmas, parsed_dict.links)

# paradigm -> rule id; ids are handed out in order of first appearance.
seen_paradigms = {}
# (rule id, stem) for every lemma.
stems = []
for lemma in lemmas:
    if not lemma:
        continue
    stem, paradigm = _to_paradigm(lemma)
    rule = seen_paradigms.setdefault(paradigm, len(seen_paradigms))
    stems.append((rule, stem))
paradigms = len(seen_paradigms)
print('paradigms:  %d' % paradigms)

conn = sqlite3.connect('opencorpora.sqlite')
try:
    # Kept from the original script; only affects how TEXT values are
    # returned when read back through this connection.
    conn.text_factory = str

    # NOTE: plain CREATE TABLE raises sqlite3.OperationalError when the
    # table already exists, i.e. the script expects a fresh database file.
    conn.execute('CREATE TABLE form (rule integer, suffix text, tag text)')
    # The column is called "prefix" but holds the stem of the lemma; the
    # name is kept so existing readers of the database keep working.
    conn.execute('CREATE TABLE stem (rule integer, prefix text)')

    # Rows are written in rule-id order so the database content does not
    # depend on dict iteration order.
    # NOTE(review): the per-form prefix of a paradigm is not stored, as in
    # the original script -- confirm that readers do not need it.
    conn.executemany(
        'INSERT INTO form VALUES(?,?,?)',
        (
            (rule, suff, tag)
            for para, rule in sorted(seen_paradigms.items(),
                                     key=lambda item: item[1])
            for suff, tag, pref in para
        ),
    )
    conn.executemany('INSERT INTO stem VALUES(?,?)', stems)

    # norm holds the first form row of every rule.  min(rowid) makes the
    # choice explicit (LIMIT 1 without ORDER BY leaves it unspecified) and
    # needs one grouped pass instead of a correlated scan per form row.
    conn.execute('CREATE TABLE norm(rule integer, suffix text, tag text)')
    conn.execute(
        'INSERT INTO norm SELECT rule, suffix, tag FROM form '
        'WHERE rowid IN (SELECT min(rowid) FROM form GROUP BY rule)'
    )
    conn.commit()
finally:
    # Release the database file even if one of the statements fails.
    conn.close()