import json

import pandas as pd
def _span_tags(length, spans, row_id):
    """Build the per-character tag list for one row.

    Every position starts as 'O'. Each span marks its first character
    'B-<label>', its last character 'E-<label>' and everything in between
    'M-<label>'; a one-character span gets only the 'B-' tag. Spans are
    applied in order, so a later span overwrites an earlier overlapping one.

    Args:
        length: Number of characters in the row's text.
        spans: Iterable of dicts with 'start', 'end' (exclusive) and
            'labels' (the first entry is used as the label name).
        row_id: Row identifier, used only in error messages.

    Returns:
        A list of ``length`` tag strings.

    Raises:
        ValueError: If a span is empty or does not fit inside the text.
    """
    tags = ['O'] * length
    for span in spans:
        start = span['start']
        end = span['end']
        label = span['labels'][0]
        # Reject bad offsets up front: a zero-length span would otherwise put
        # an 'E-' tag on the character *before* the span, and an out-of-range
        # one would either raise a bare IndexError or corrupt the tag list.
        if not 0 <= start < end <= length:
            raise ValueError(
                f'row {row_id}: span [{start}, {end}) does not fit '
                f'text of length {length}'
            )
        tags[start] = f'B-{label}'
        for pos in range(start + 1, end - 1):
            tags[pos] = f'M-{label}'
        if end - 1 != start:
            tags[end - 1] = f'E-{label}'
    return tags


def gen_ner_data(file_path, save_path):
    """Convert a Label Studio CSV export into per-character NER tag lines.

    For every CSV row one line of the form ``[tag,tag,...]`` is appended to
    ``save_path``, with one tag per character of the row's ``text`` column.
    Only the tags are written; the text itself is not. Rows with an empty
    ``text`` produce ``[]``.

    Args:
        file_path: CSV file exported from Label Studio. Must contain a
            ``text`` column and a ``label`` column holding a JSON list of
            ``{"start": int, "end": int, "labels": [str, ...]}`` objects
            (``end`` exclusive), or an empty cell when there are no spans.
        save_path: Output file. It is opened in append mode, so repeated
            calls accumulate lines.

    Raises:
        ValueError: If a span is empty or lies outside its row's text.
    """
    # Read every cell as a raw string and treat only truly empty cells as
    # missing. With the pandas defaults a text such as "NA" or "null" would
    # be turned into NaN and a text such as "123" into a number, which breaks
    # the character offsets stored in the label column.
    data = pd.read_csv(
        file_path,
        encoding="utf-8-sig",
        dtype=str,
        keep_default_na=False,
        na_values=[''],
    )

    lines = []
    for idx, item in data.iterrows():
        text = item['text']
        length = 0 if pd.isna(text) else len(text)
        labels = item['label']
        spans = [] if pd.isna(labels) else json.loads(labels)
        tags = _span_tags(length, spans, idx)
        lines.append('[' + ','.join(tags) + ']\n')

    # Open the output once instead of once per row; skip it entirely when
    # there is nothing to write so an empty input does not create the file.
    if lines:
        with open(save_path, 'a', encoding="utf-8") as f:
            f.writelines(lines)