-
Notifications
You must be signed in to change notification settings - Fork 1
/
tag-lemmatize.py
49 lines (39 loc) · 1.16 KB
/
tag-lemmatize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 10 16:12:07 2016
@author: ktt
https://github.com/KT12
"""
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Lookup table: first letter of a Penn Treebank tag -> WordNet POS code.
# WordNet distinguishes only five parts of speech; any letter missing from
# this table is treated as a noun by convert_tag().
# NOTE(review): 'S' appears reachable only via Penn tags that start with S
# (e.g. SYM) -- confirm that mapping them to 's' is intended.
part = {
    'N': 'n',  # noun
    'V': 'v',  # verb
    'J': 'a',  # adjective
    'S': 's',  # adjective satellite
    'R': 'r',  # adverb
}
# Module-level lemmatizer instance, created once at import time and reused
# by every tag_and_lem() call.
wnl = WordNetLemmatizer()
def convert_tag(penn_tag):
    '''
    convert_tag() accepts the **first letter** of a Penn part-of-speech tag
    and converts it to the matching WordNet tag.

    penn_tag -- the initial character of a Penn tag (e.g. 'V' for 'VBD').

    Returns the value stored for that letter in the module-level ``part``
    dict, or 'n' (noun) when the letter has no entry there.
    '''
    # A single lookup with a default replaces the membership test on
    # part.keys() followed by a second subscript lookup.
    return part.get(penn_tag, 'n')
def tag_and_lem(element):
    '''
    tag_and_lem() accepts a string, tokenizes it, POS-tags the tokens,
    converts each tag to a WordNet tag, lemmatizes each token, and returns
    a string.

    element -- the text to process.

    Returns the lemmatized tokens joined by single spaces.
    '''
    # list of tuples [('token', 'tag'), ('token2', 'tag2')...]
    # The whole token list is tagged in one call: must tag in context.
    tagged = pos_tag(word_tokenize(element))
    # convert_tag() expects only the first letter of each Penn tag.
    return ' '.join([wnl.lemmatize(token, convert_tag(tag[0]))
                     for token, tag in tagged])