有谁知道是否有类似于StopWordsRemover但旨在清除 HTML 语法的功能?例如,在转换后获取没有任何 html 标签的文本。
HTML 标签移除器(HTML words remover)?
数据挖掘
Apache Spark
2022-03-08 12:50:04
1个回答
我写了一个简单的类——如果有人感兴趣的话:
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import java.util.UUID;
/**
 * Spark ML {@link Transformer} that copies a string column into a new column
 * with every {@code <...>} tag-like substring removed.
 *
 * <p>The stripping is a plain regular-expression replacement
 * ({@code <[^>]*>} replaced by the empty string); it does not parse HTML and
 * does not decode character entities.
 */
public class HTMLStripper extends Transformer {
    /** Name under which the stripping UDF is registered; also the UID prefix. */
    private static final String UDF_NAME = "HTMLStripper";

    /** Identifier of this stage, generated once per instance and kept by {@link #copy}. */
    private final String uid;
    private final String inputColumn;
    private final String outputColumn;

    /**
     * Creates a stripper with a freshly generated UID.
     *
     * @param inputColumn  name of the string column to read
     * @param outputColumn name of the column that receives the stripped text
     */
    public HTMLStripper(String inputColumn, String outputColumn) {
        this(UDF_NAME + "_" + UUID.randomUUID(), inputColumn, outputColumn);
    }

    /** Used by {@link #copy} so that the copy carries the same UID as the original. */
    private HTMLStripper(String uid, String inputColumn, String outputColumn) {
        this.uid = uid;
        this.inputColumn = inputColumn;
        this.outputColumn = outputColumn;
    }

    /**
     * Returns the identifier of this stage.
     *
     * <p>The previous implementation called {@code UUID.fromString("HTMLStripper")},
     * which throws {@link IllegalArgumentException} because the argument is not
     * a UUID string; the identifier is now generated once in the constructor.
     *
     * @return the UID assigned to this instance
     */
    @Override
    public String uid() {
        return uid;
    }

    /**
     * Describes the output schema: the input schema plus a nullable string
     * field named after the output column.
     *
     * @param schema schema of the incoming dataset
     * @return {@code schema} with the output column appended
     */
    @Override
    public StructType transformSchema(StructType schema) {
        return schema.add(outputColumn, DataTypes.StringType, true);
    }

    /**
     * Registers the tag-stripping UDF on the dataset's SQL context and applies
     * it to the input column, storing the result in the output column.
     *
     * @param dataset dataset containing the input column
     * @return the dataset with the output column added
     */
    @Override
    public Dataset<Row> transform(Dataset<?> dataset) {
        // Null cells are passed through unchanged; calling replaceAll on them
        // would throw a NullPointerException inside the UDF.
        dataset.sqlContext().udf().register(
                UDF_NAME,
                (String str) -> str == null ? null : str.replaceAll("<[^>]*>", ""),
                DataTypes.StringType);
        Column stripped = functions.callUDF(UDF_NAME, dataset.col(inputColumn));
        return dataset.withColumn(outputColumn, stripped);
    }

    /**
     * Creates a copy of this stage with the same UID and column names.
     *
     * @param extra extra parameters; unused because this stage declares no params
     * @return a new {@code HTMLStripper} equivalent to this one
     */
    @Override
    public Transformer copy(ParamMap extra) {
        return new HTMLStripper(uid, inputColumn, outputColumn);
    }
}
其它你可能感兴趣的问题