在推特数据中,我遇到了像“抵制熊”这样粘在一起的词,我希望它们成为“抵制”“熊”“人”
我试过下面这段代码,但它运行得很慢:
def _segment_token(token, word_to_index):
    """Split one whitespace-free token into pieces, scanning left to right.

    At each position the longest substring that is a member of
    ``word_to_index`` is taken as the next piece. Characters that do not
    start any known word are collected into a single "unknown" piece so the
    scan always advances and no input text is lost.
    """
    pieces = []
    unknown = []  # characters not covered by any known word, pending flush
    start = 0
    token_len = len(token)
    while start < token_len:
        # Greedy match: try the longest candidate first, shrinking from the
        # right, so "bears" wins over "bear" when both are known.
        for stop in range(token_len, start, -1):
            if token[start:stop] in word_to_index:
                break
        else:
            # No known word starts here; keep the character and move on
            # instead of retrying forever.
            unknown.append(token[start])
            start += 1
            continue
        if unknown:
            pieces.append("".join(unknown))
            unknown.clear()
        pieces.append(token[start:stop])
        start = stop
    if unknown:
        pieces.append("".join(unknown))
    return pieces


def split(sentence, word_to_index):
    """Insert spaces between known words that are glued together.

    The sentence is split on whitespace, and every resulting token is
    segmented by greedy longest-prefix matching against ``word_to_index``.
    For example, with "boycott" and "bears" as known words,
    ``"boycottbears"`` becomes ``"boycott bears"``.

    Args:
        sentence: Text to process; tokens are separated by whitespace.
        word_to_index: Collection of valid words. Only membership (``in``)
            is used, so a dict keyed by word or a set both work; the mapped
            values are never read.

    Returns:
        All pieces joined by single spaces. Runs of characters that match
        no known word are kept unchanged as their own piece.
    """
    pieces = []
    for token in sentence.split():
        pieces.extend(_segment_token(token, word_to_index))
    return " ".join(pieces)
```