https://www.cnblogs.com/combfish/p/7830807.html
I found a more convenient annotation tool:
https://github.com/jiesutd/SUTDAnnotator
It is written in Python 2 with a fairly lightweight interface, which makes it suitable for personal use. For team use, a web-based interface would be better.
Run Annotator_backup.py to get the interface shown below: use "open" to import a file, select the text to be labeled, and press one of the preset shortcut keys (a-v) to apply the corresponding label.
To add entity types of your own: enter the entity name next to the corresponding shortcut-key button and press "ReMap" to reassign that shortcut key to the new entity name.
Export results: export
Other operations are described in more detail in the documentation on GitHub.
The following Python 3 code converts the exported .ann annotation files into a common training-sample format.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
import
re
# str2ner_train_data: turn a labeled string into NER-trainable data
# s: labeled str, e.g. '我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华大学#SCHOOL*]'
# save_path: path of the NER training text file to write
def str2ner_train_data(s, save_path):
    """Convert an annotated string to character-level BIOES training data.

    Each annotation in *s* has the form ``[@entity#TYPE*]``.  Every
    character is written to *save_path* as ``<char> <label>`` (one per
    line, UTF-8): ``S-TYPE`` for a single-character entity,
    ``B-``/``I-``/``E-TYPE`` for the boundary/interior of longer
    entities, and ``0`` for unannotated characters (kept as the integer
    0 for backward compatibility with the original output format).

    Parameters
    ----------
    s : str
        Annotated input string.
    save_path : str
        Path of the output file; overwritten if it exists.

    Raises
    ------
    ValueError
        If the numbers of opening ``[@`` and closing ``*]`` markers
        differ (unbalanced annotations).  The original used a bare
        ``assert``, which is silently stripped under ``python -O``.
    """
    begin = [m.start() for m in re.finditer(r'\[\@', s)]
    end = [m.end() for m in re.finditer(r'\*\]', s)]
    if len(begin) != len(end):
        raise ValueError('unbalanced [@ ... *] annotation markers')
    begin_set = set(begin)  # O(1) membership tests inside the scan loop
    ner_data = []
    i = 0
    j = 0
    while i < len(s):
        if i not in begin_set:
            # plain character outside any annotation
            ner_data.append([s[i], 0])
            i += 1
        else:
            # strip the 2-char '[@' prefix and '*]' suffix of annotation j
            ann = s[i + 2:end[j] - 2]
            # maxsplit=1: a '#' inside the entity text must not break
            # the 2-value unpacking (bug in the original, which used
            # rsplit('#') and raised ValueError on such input)
            entity, ner = ann.rsplit('#', 1)
            if len(entity) == 1:
                ner_data.append([entity, 'S-' + ner])
            elif len(entity) == 2:
                ner_data.append([entity[0], 'B-' + ner])
                ner_data.append([entity[1], 'E-' + ner])
            else:
                ner_data.append([entity[0], 'B-' + ner])
                for ch in entity[1:-1]:
                    ner_data.append([ch, 'I-' + ner])
                ner_data.append([entity[-1], 'E-' + ner])
            # jump past the whole annotation and move to the next one
            i = end[j]
            j += 1
    # `with` guarantees the file is closed even if a write fails
    # (the original used open/close with no exception safety)
    with open(save_path, 'w', encoding='utf-8') as f:
        for char, label in ner_data:
            f.write(char + ' ' + str(label) + '\n')
# txt2ner_train_data: turn a labeled multi-line text file into NER-trainable data
# file_path: labeled multi-line txt, each line like '我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华大学#SCHOOL*]'
# save_path: path of the NER training text file to write
def txt2ner_train_data(file_path, save_path):
    """Read an annotated text file, flatten it, and write NER training data.

    Newlines and spaces are stripped from every line, the lines are
    concatenated into a single string, and the result is forwarded to
    ``str2ner_train_data``.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 annotated input file.
    save_path : str
        Path of the training file to write (forwarded unchanged).
    """
    # `with` closes the handle even if reading fails (the original leaked
    # the descriptor on error); a single join avoids the quadratic
    # string concatenation of the original `s = s + line` loop
    with open(file_path, 'r', encoding='utf-8') as fr:
        s = ''.join(line.replace('\n', '').replace(' ', '') for line in fr)
    str2ner_train_data(s, save_path)
if __name__ == '__main__':
    import os

    # Demo 1: convert a single annotated string.
    s = '我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华大学#SCHOOL*]'
    save_path = 's.txt'
    str2ner_train_data(s, save_path)

    # Demo 2: convert a whole .ann file exported by SUTDAnnotator.
    # The path is machine-specific; guard it so the script does not
    # crash with FileNotFoundError where the sample file is absent.
    file_path = 'D:\\codes\\python_codes\\SUTDAnnotator-master\\demotext\\ChineseDemo.txt.ann'
    if os.path.exists(file_path):
        txt2ner_train_data(file_path, 's1.txt')
    else:
        print('demo file not found, skipping: ' + file_path)
|