HTML 标签移除器?

数据挖掘 Apache Spark
2022-03-08 12:50:04

有谁知道是否有类似于StopWordsRemover但旨在清除 HTML 语法的功能?例如,在转换后获取没有任何 html 标签的文本。

1个回答

写了一个简单的类——如果有人感兴趣的话:

import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.param.Param;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import java.util.UUID;

/**
 * Spark ML {@link Transformer} that removes HTML-style tags from a string column.
 *
 * <p>Every substring that starts with {@code <} and runs up to the next {@code >} is
 * deleted from the input column's value, and the result is written to the output
 * column. This is a plain regular-expression replacement: it does not parse HTML and
 * does not decode character entities such as {@code &amp;}.
 *
 * <p>Null input values produce null output values.
 */
public class HTMLStripper extends Transformer {
    /** Name under which the tag-stripping UDF is registered; also the uid prefix. */
    private static final String UDF_NAME = "HTMLStripper";
    /** Matches one tag: '<', then any run of characters other than '>', then '>'. */
    private static final String TAG_REGEX = "<[^>]*>";

    private final String uid;
    private final String inputColumn;
    private final String outputColumn;

    /**
     * Creates a stripper with a freshly generated unique identifier.
     *
     * @param inputColumn  name of the string column to read
     * @param outputColumn name of the column that receives the tag-free text
     */
    public HTMLStripper(String inputColumn, String outputColumn) {
        // UUID.fromString("HTMLStripper") always threw IllegalArgumentException because
        // the argument is not a UUID; generate a random one instead, once per instance.
        this(UDF_NAME + "_" + UUID.randomUUID(), inputColumn, outputColumn);
    }

    /** Used by {@link #copy(ParamMap)} so that a copy keeps the original's uid. */
    private HTMLStripper(String uid, String inputColumn, String outputColumn) {
        this.uid = uid;
        this.inputColumn = inputColumn;
        this.outputColumn = outputColumn;
    }

    /**
     * @return this instance's identifier; the same value on every call
     */
    @Override
    public String uid() {
        return uid;
    }

    /**
     * Appends the output column to the given schema as a nullable string field.
     *
     * @param schema schema of the incoming dataset
     * @return {@code schema} plus the output column
     */
    @Override
    public StructType transformSchema(StructType schema) {
        return schema.add(outputColumn, DataTypes.StringType, true);
    }

    /**
     * Registers the tag-stripping UDF on the dataset's SQL context and applies it to
     * the input column.
     *
     * @param dataset dataset containing the input column
     * @return {@code dataset} with the output column added (or replaced)
     */
    @Override
    public Dataset<Row> transform(Dataset<?> dataset) {
        dataset.sqlContext().udf().register(
                UDF_NAME,
                // Guard against null cells: calling replaceAll on null would fail the job.
                (String str) -> str == null ? null : str.replaceAll(TAG_REGEX, ""),
                DataTypes.StringType);
        Column stripped = functions.callUDF(UDF_NAME, dataset.col(inputColumn));
        return dataset.withColumn(outputColumn, stripped);
    }

    /**
     * Creates a copy with the same uid and column names.
     *
     * @param extra ignored; this transformer defines no {@code Param}s
     * @return a new {@code HTMLStripper} equivalent to this one
     */
    @Override
    public Transformer copy(ParamMap extra) {
        return new HTMLStripper(uid, inputColumn, outputColumn);
    }
}