谢谢大家,我一定会试用 FuzzyWuzzy 库。同时,请看一下我昨天写的代码(它还可以改进,如果你能做得更好,欢迎继续完善):
#define function to read the CSV files
def GetDocument(link, encoding="ISO-8859-1", sep=';'):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        link: path (or any location accepted by ``pd.read_csv``) of the file.
        encoding: text encoding of the file. Defaults to ``"ISO-8859-1"``,
            the value this script has always used.
        sep: field separator. Defaults to ``';'``, the value this script
            has always used.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(link, encoding=encoding, sep=sep)
#define function convert text/CSV data to lists
def ColumnToList(column):
    """Return the values of *column* as a list.

    Args:
        column: any object exposing a ``tolist()`` method.

    Returns:
        Whatever ``column.tolist()`` returns.
    """
    values = column.tolist()
    return values
# Load the two CSV documents (first DOC1, then DOC2).
document1, document2 = (
    GetDocument('C:/Users/smegrhi/Desktop/DOC1.csv'),
    GetDocument('C:/Users/smegrhi/Desktop/DOC2.csv'),
)
# Pull the label column out of each document as a list.
list1, list2 = (
    ColumnToList(document1['DIA_LBL']),
    ColumnToList(document2['ICD10_LBL_FR']),
)
#Data pre-processing
def ProcessText(sentence):
    """Normalize a piece of text before it is tokenized.

    The steps run in this order:

    1. lower-case the text;
    2. delete URLs (tokens starting with ``www.`` or ``http(s)://``);
    3. delete ``@mention`` tokens;
    4. collapse every run of whitespace into a single space;
    5. drop the leading ``#`` of hashtags, keeping the word itself.

    Args:
        sentence: the text to normalize. ``None`` and float NaN are treated
            as missing text.

    Returns:
        The normalized string; ``''`` for a missing value.
    """
    # A missing value cannot be lower-cased; NaN is the only float that is
    # not equal to itself, which avoids needing an extra import here.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    # Convert to lower case
    sentence = sentence.lower()
    # Remove URLs (www.* or http(s)://*)
    sentence = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', sentence)
    # Remove @mentions
    sentence = re.sub(r'@[^\s]+', '', sentence)
    # Collapse additional white spaces
    sentence = re.sub(r'[\s]+', ' ', sentence)
    # Replace #word with word
    sentence = re.sub(r'#([^\s]+)', r'\1', sentence)
    return sentence
# Run the pre-processing over every entry of list1, updating the list in place.
list1[:] = [ProcessText(text) for text in list1]
# Tokenizer: a token is a maximal run of word characters.
WORD = re.compile(r'\w+')


# Bag-of-words vectorization
def text_to_vector(text):
    """Count how often each token matched by ``WORD`` occurs in *text*.

    Args:
        text: the string to tokenize.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    word_counts = Counter()
    for match in WORD.finditer(text):
        word_counts[match.group()] += 1
    return word_counts
# Cosine similarity between two term-count vectors
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term -> count mappings.

    Args:
        vec1: mapping from term to count.
        vec2: mapping from term to count.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, as a float; ``0.0`` when that product is zero.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    norm1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2.keys()))
    magnitude = norm1 * norm2

    # Guard against division by zero when either vector is empty.
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude
# compare sentence from list 1 with list 2 : Choose Indication from Data/DOC1
text1 = list1[1]
#text1 = shigellose à shigella dysenteriae
vector1 = text_to_vector(text1)
# Score every candidate of list2 against the query. The query comes from
# list1, which has been run through ProcessText; each candidate gets the same
# normalization here (without modifying list2), otherwise tokens that differ
# only by case would never match and the similarity would be underestimated.
listCosine = []
for text in list2:
    vector2 = text_to_vector(ProcessText(text))
    cosine = get_cosine(vector1, vector2)
    listCosine.append(cosine)
结果不算太差,但也不算太好……大家有什么建议吗?