这个问题似乎是一个多类多标签问题。提问者似乎很乐意构建详细的本体。这些导致作者提出以下方法。请注意,可以在此处的文章中找到对此的详细说明。
解决问题的步骤:
- 将分类文件构建为 csv 文件,如下所示。请注意,列标题应与下面给出的相同。
- 将所有内容放在另一个 csv 文件中,如下所示。请注意,列标题应与下面给出的相同。
- 在以下 python 代码中,请在df的路径中输入内容的路径,在df_tx的路径中输入分类的路径。这些步骤出现在用于映射的评论导入数据附近。在代码末尾为输出添加另一个路径值。
运行下面的python代码。请注意,此代码在 Windows 10 机器上的 Python 2.7 上运行良好。请自行解决任何技术问题,因为作者可能对此类问题没有太大帮助。
#Invoke Libraries
import pandas as pd
import numpy as np
import re
#import data for mapping
df = pd.read_csv("path to content csv");
df_tx = pd.read_csv("path to taxonomy csv");
#Build functions
#function that identifies taxonomy words ending with (*) and treats it as a wild character
def asterix_handler(asterixw, lookupw):
mtch = "F"
for word in asterixw:
for lword in lookupw:
if(word[-1:]=="*"):
if(bool(re.search("^"+ word[:-1],lword))==True):
mtch = "T"
break
return(mtch)
#function that removes all punctuations. helps in creation of set of words
def remov_punct(withpunct):
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
without_punct = ""
char = 'nan'
for char in withpunct:
if char not in punctuations:
without_punct = without_punct + char
return(without_punct)
#function to remove just the quotes(""). This is for the taxonomy
def remov_quote(withquote):
quote = '"'
without_quote = ""
char = 'nan'
for char in withquote:
if char not in quote:
without_quote = without_quote + char
return(without_quote)
#split each document by sentences and append one below the other for sentence level categorization and sentiment mapping
sentence_data = pd.DataFrame(columns=['slno','text'])
for d in range(len(df)):
doc = (df.iloc[d,1].split('.'))
for s in ((doc)):
temp = {'slno': [df['slno'][d]], 'text': [s]}
sentence_data = pd.concat([sentence_data,pd.DataFrame(temp)])
temp = ""
#drop empty text rows and export data
sentence_data['text'].replace('',np.nan,inplace=True);
sentence_data.dropna(subset=['text'], inplace=True);
data = sentence_data
cat2list = list(set(df_tx['Category2']))
data['Category'] = 0
mapped_data = pd.DataFrame(columns = ['slno','text','Category']);
temp=pd.DataFrame()
for k in range(len(data)):
comment = remov_punct(data.iloc[k,1])
data_words = [str(x.strip()).lower() for x in str(comment).split()]
data_words = filter(None, data_words)
output = []
for l in range(len(df_tx)):
key_flag = False
and_flag = False
not_flag = False
if (str(df_tx['Keywords'][l])!='nan'):
kw_clean = (remov_quote(df_tx['Keywords'][l]))
if (str(df_tx['AndWords'][l])!='nan'):
aw_clean = (remov_quote(df_tx['AndWords'][l]))
else:
aw_clean = df_tx['AndWords'][l]
if (str(df_tx['NotWords'][l])!='nan'):
nw_clean = remov_quote(df_tx['NotWords'][l])
else:
nw_clean = df_tx['NotWords'][l]
Key_words = 'nan'
and_words = 'nan'
and_words2 = 'nan'
not_words = 'nan'
not_words2 = 'nan'
if(str(kw_clean)!='nan'):
key_words = [str(x.strip()).lower() for x in kw_clean.split(',')]
key_words2 = set(w.lower() for w in key_words)
if(str(aw_clean)!='nan'):
and_words = [str(x.strip()).lower() for x in aw_clean.split(',')]
and_words2 = set(w.lower() for w in and_words)
if(str(nw_clean)!= 'nan'):
not_words = [str(x.strip()).lower() for x in nw_clean.split(',')]
not_words2 = set(w.lower() for w in not_words)
if(str(kw_clean) == 'nan'):
key_flag = False
else:
if set(data_words) & key_words2:
key_flag = True
elif(bool(re.search('"',df_tx['Keywords'][l]))==True and quote_handler(key_words, comment) == 'T'):
key_flag = True
elif(asterix_handler(key_words2, data_words)=='T'):
key_flag = True
if(str(aw_clean)=='nan'):
and_flag = True
else:
if set(data_words) & and_words2:
and_flag = True
elif(bool(re.search('"',df_tx['AndWords'][l]))==True and quote_handler(and_words, comment) == 'T'):
and_flag = True
elif(asterix_handler(and_words2, data_words)=='T'):
and_flag = True
if(str(nw_clean) == 'nan'):
not_flag = False
else:
if set(data_words) & not_words2:
not_flag = True
elif(bool(re.search('"',df_tx['NotWords'][l]))==True and quote_handler(not_words, comment) == 'T'):
not_flag = True
elif(asterix_handler(not_words2, data_words)=='T'):
not_flag = True
if(key_flag == True and and_flag == True and not_flag == False):
output.append(str(df_tx['Category2'][l]))
temp = {'slno': [data.iloc[k,0]], 'text': [data.iloc[k,1].strip()], 'Category': [df_tx['Category2'][l]]}
mapped_data = pd.concat([mapped_data,pd.DataFrame(temp)], sort = False)
#output mapped data
mapped_data = mapped_data[['slno', 'text', 'Category']]
mapped_data.to_csv("Path here/mapped_data.csv",index = False)
最终输出如下所示: