From 2adebe8de3881509e510fc518c562d1141ccd0ef Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 10 Aug 2016 13:40:18 +0800 Subject: [PATCH 1/7] add a chiSquare Selector based on False Positive Rate (FPR) test --- .../spark/mllib/feature/ChiSqSelector.scala | 29 +++++++++++++++++-- .../mllib/feature/ChiSqSelectorSuite.scala | 18 ++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index c8c2823bbaf04..f3316eeee8fec 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -173,8 +173,8 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * Creates a ChiSquared feature selector. * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) - * Note that if the number of features is < numTopFeatures, then this will - * select all features. + * Note that if the number of features is less than numTopFeatures, + * then this will select all features. */ @Since("1.3.0") class ChiSqSelector @Since("1.3.0") ( @@ -197,3 +197,28 @@ class ChiSqSelector @Since("1.3.0") ( new ChiSqSelectorModel(indices) } } + +/** + * Creates a ChiSquared feature selector by False Positive Rate (FPR) test. + * @param alpha the highest p-value for features to be kept + */ +@Since("2.1.0") +class ChiSqSelectorByFpr @Since("2.1.0") ( + @Since("2.1.0") val alpha: Double) extends Serializable { + + /** + * Returns a ChiSquared feature selector by FPR. + * + * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features. + * Real-valued features will be treated as categorical for each distinct value. + * Apply feature discretizer before using this function. + */ + @Since("2.1.0") + def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { + val indices = Statistics.chiSqTest(data) + .zipWithIndex.filter { case (res, _) => res.pValue < alpha } + .map { case (_, indices) => indices } + .sorted + new ChiSqSelectorModel(indices) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 734800a9afad6..6b2209c8a7c15 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -65,6 +65,24 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { assert(filteredData == preFilteredData) } + test("ChiSqSelectorByFpr transform test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), + LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(4.0))), + LabeledPoint(1.0, Vectors.dense(Array(4.0))), + LabeledPoint(2.0, Vectors.dense(Array(9.0)))) + val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + } + test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() From 7623563884355a04867ce5271baa286f65180e62 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 16 Aug 2016 21:36:11 +0800 Subject: [PATCH 2/7] Configure the ChiSqSelector to reuse ChiSqTestResult by numTopFeatures, Percentile, and Fpr selector --- .../mllib/JavaChiSqSelectorExample.java | 3 +- .../examples/mllib/ChiSqSelectorExample.scala | 3 +- .../spark/ml/feature/ChiSqSelector.scala | 60 ++++++++++- .../mllib/api/python/PythonMLLibAPI.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 102 +++++++++++------- .../mllib/feature/ChiSqSelectorSuite.scala | 6 +- 6 files changed, 126 insertions(+), 52 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java index ad44acb4cd6e3..f0619b7bc5685 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -56,7 +56,8 @@ public LabeledPoint call(LabeledPoint lp) { ); // Create ChiSqSelector that will select top 50 of 692 features - ChiSqSelector selector = new ChiSqSelector(50); + ChiSqSelector selector = new ChiSqSelector(); + selector.setNumTopFeatures(50); // Create ChiSqSelector model (selecting features) final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); // Filter the top 50 features from each feature vector diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala index 5e400b7d715b4..9fb520ce56acc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -43,7 +43,8 @@ object ChiSqSelectorExample { LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) } // Create ChiSqSelector that will select top 50 of 692 features - val selector = new ChiSqSelector(50) + val selector = new ChiSqSelector() + selector.setNumTopFeatures(50) // Create ChiSqSelector model (selecting features) val transformer = selector.fit(discretizedData) // Filter the top 50 features from each feature vector diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1482eb3d1f7a6..439514bdb4a4c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,6 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature +import org.apache.spark.mllib.feature.SelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -51,11 +52,29 @@ private[feature] trait ChiSqSelectorParams extends Params " number of features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) setDefault(numTopFeatures -> 50) + final val percentile = new IntParam(this, "percentile", + "Percentile of features that selector will select, ordered by statistics value descending.", + ParamValidators.gtEq(0)) + setDefault(percentile -> 10) + + final val alpha = new DoubleParam(this, "alpha", + "The highest p-value for features to be kept.", + ParamValidators.gtEq(0)) + setDefault(alpha -> 0.05) + + final val selectorType = SelectorType.KBest /** @group getParam */ def getNumTopFeatures: Int = $(numTopFeatures) + + def getPercentile: Int = $(percentile) + + def getAlpha: Double = $(alpha) + + def getSelectorType: SelectorType.Value = selectorType } + /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. @@ -66,10 +85,26 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) - + val chiSqSelector = new feature.ChiSqSelector() /** @group setParam */ @Since("1.6.0") - def setNumTopFeatures(value: Int): this.type = set(numTopFeatures, value) + def setNumTopFeatures(value: Int): this.type = { + chiSqSelector.setNumTopFeatures(value) + chiSqSelector.setSelectorType(SelectorType.KBest) + set(numTopFeatures, value) + } + + def setPercentile(value: Int): this.type = { + chiSqSelector.setPercentile(value) + chiSqSelector.setSelectorType(SelectorType.Percentile) + set(percentile, value) + } + + def setAlpha(value: Double): this.type = { + chiSqSelector.setAlpha(value) + chiSqSelector.setSelectorType(SelectorType.Fpr) + set(alpha, value) + } /** @group setParam */ @Since("1.6.0") @@ -89,10 +124,25 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str val input: RDD[OldLabeledPoint] = dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => - OldLabeledPoint(label, OldVectors.fromML(features)) + OldLabeledPoint(label, OldVectors.fromML(features)) } - val chiSqSelector = new feature.ChiSqSelector($(numTopFeatures)).fit(input) - copyValues(new ChiSqSelectorModel(uid, chiSqSelector).setParent(this)) + val model = chiSqSelector.fit(input) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectKBest(value: Int): ChiSqSelectorModel = { + val model = chiSqSelector.selectKBest(value) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectPercentile(value: Int): ChiSqSelectorModel = { + val model = chiSqSelector.selectPercentile(value) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectFpr(value: Double): ChiSqSelectorModel = { + val model = chiSqSelector.selectFpr(value) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a80cca70f4b28..bdcfe70651e3d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable { * Extra care needs to be taken in the Python code to ensure it gets freed on * exit; see the Py4J documentation. */ - def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector(numTopFeatures).fit(data.rdd) + def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index f3316eeee8fec..9bc75c65165fd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -27,22 +27,26 @@ import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} +object SelectorType extends Enumeration { + type SelectorType = Value + val KBest, Percentile, Fpr = Value +} + /** * Chi Squared selector model. * - * @param selectedFeatures list of indices to select (filter). Must be ordered asc + * @param selectedFeatures list of indices to select (filter). */ @Since("1.3.0") class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable { - require(isSorted(selectedFeatures), "Array has to be sorted asc") - protected def isSorted(array: Array[Int]): Boolean = { var i = 1 val len = array.length @@ -69,21 +73,23 @@ class ChiSqSelectorModel @Since("1.3.0") ( * Preserves the order of filtered features the same as their indices are stored. * Might be moved to Vector as .slice * @param features vector - * @param filterIndices indices of features to filter, must be ordered asc + * @param filterIndices indices of features to filter */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { + val orderedIndices = filterIndices.sorted + require(isSorted(orderedIndices), "Array has to be sorted asc") features match { case SparseVector(size, indices, values) => - val newSize = filterIndices.length + val newSize = orderedIndices.length val newValues = new ArrayBuilder.ofDouble val newIndices = new ArrayBuilder.ofInt var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 - while (i < indices.length && j < filterIndices.length) { + while (i < indices.length && j < orderedIndices.length) { indicesIdx = indices(i) - filterIndicesIdx = filterIndices(j) + filterIndicesIdx = orderedIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) @@ -101,7 +107,7 @@ class ChiSqSelectorModel @Since("1.3.0") ( Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(values) => val values = features.toArray - Vectors.dense(filterIndices.map(i => values(i))) + Vectors.dense(orderedIndices.map(i => values(i))) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") @@ -171,14 +177,34 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * @param numTopFeatures number of features that selector will select - * (ordered by statistic value descending) - * Note that if the number of features is less than numTopFeatures, - * then this will select all features. */ @Since("1.3.0") -class ChiSqSelector @Since("1.3.0") ( - @Since("1.3.0") val numTopFeatures: Int) extends Serializable { +class ChiSqSelector @Since("1.3.0") () extends Serializable { + var numTopFeatures: Int = 1 + var percentile: Int = 10 + var alpha: Double = 0.05 + var selectorType = SelectorType.KBest + var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0) + + def setNumTopFeatures(value: Int): this.type = { + numTopFeatures = value + selectorType = SelectorType.KBest + this + } + def setPercentile(value: Int): this.type = { + percentile = value + selectorType = SelectorType.Percentile + this + } + def setAlpha(value: Double): this.type = { + alpha = value + selectorType = SelectorType.Fpr + this + } + def setSelectorType(value: SelectorType.Value): this.type = { + selectorType = value + this + } /** * Returns a ChiSquared feature selector. @@ -189,36 +215,32 @@ class ChiSqSelector @Since("1.3.0") ( */ @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - val indices = Statistics.chiSqTest(data) - .zipWithIndex.sortBy { case (res, _) => -res.statistic } - .take(numTopFeatures) - .map { case (_, indices) => indices } - .sorted + chiSqTestResult = Statistics.chiSqTest(data) + selectorType match { + case SelectorType.KBest => selectKBest(numTopFeatures) + case SelectorType.Percentile => selectPercentile(percentile) + case SelectorType.Fpr => selectFpr(alpha) + } + } + + def selectKBest(value: Int): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take(numTopFeatures) + .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } -} -/** - * Creates a ChiSquared feature selector by False Positive Rate (FPR) test. - * @param alpha the highest p-value for features to be kept - */ -@Since("2.1.0") -class ChiSqSelectorByFpr @Since("2.1.0") ( - @Since("2.1.0") val alpha: Double) extends Serializable { + def selectPercentile(value: Int): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take((chiSqTestResult.length * percentile / 100).toInt) + .map { case (_, indices) => indices } + new ChiSqSelectorModel(indices) + } - /** - * Returns a ChiSquared feature selector by FPR. - * - * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features. - * Real-valued features will be treated as categorical for each distinct value. - * Apply feature discretizer before using this function. - */ - @Since("2.1.0") - def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - val indices = Statistics.chiSqTest(data) - .zipWithIndex.filter { case (res, _) => res.pValue < alpha } - .map { case (_, indices) => indices } - .sorted + def selectFpr(value: Double): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha } + .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } } + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 6b2209c8a7c15..d61888df9c0dc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -58,14 +58,14 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) - val model = new ChiSqSelector(1).fit(labeledDiscreteData) + val model = new ChiSqSelector().fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } - test("ChiSqSelectorByFpr transform test (sparse & dense vector)") { + test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), @@ -76,7 +76,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData) + val model = new ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet From 3d6aecb8441503c9c3d62a2d8a3d48824b9d6637 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 17 Aug 2016 10:34:59 +0800 Subject: [PATCH 3/7] Config the ChiSqSelector to reuse the ChiSqTestResult by KBest, Percentile and FPR selector --- .../mllib/JavaChiSqSelectorExample.java | 3 +- .../examples/mllib/ChiSqSelectorExample.scala | 3 +- .../spark/ml/feature/ChiSqSelector.scala | 12 +++--- .../mllib/api/python/PythonMLLibAPI.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 38 ++++++++++--------- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java index f0619b7bc5685..ad44acb4cd6e3 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -56,8 +56,7 @@ public LabeledPoint call(LabeledPoint lp) { ); // Create ChiSqSelector that will select top 50 of 692 features - ChiSqSelector selector = new ChiSqSelector(); - selector.setNumTopFeatures(50); + ChiSqSelector selector = new ChiSqSelector(50); // Create ChiSqSelector model (selecting features) final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); // Filter the top 50 features from each feature vector diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala index 9fb520ce56acc..5e400b7d715b4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -43,8 +43,7 @@ object ChiSqSelectorExample { LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) } // Create ChiSqSelector that will select top 50 of 692 features - val selector = new ChiSqSelector() - selector.setNumTopFeatures(50) + val selector = new ChiSqSelector(50) // Create ChiSqSelector model (selecting features) val transformer = selector.fit(discretizedData) // Filter the top 50 features from each feature vector diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 439514bdb4a4c..a44ac2fe73aea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.feature.SelectorType +import org.apache.spark.mllib.feature.ChiSqSelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -62,7 +62,7 @@ private[feature] trait ChiSqSelectorParams extends Params ParamValidators.gtEq(0)) setDefault(alpha -> 0.05) - final val selectorType = SelectorType.KBest + final val selectorType = ChiSqSelectorType.KBest /** @group getParam */ def getNumTopFeatures: Int = $(numTopFeatures) @@ -71,7 +71,7 @@ private[feature] trait ChiSqSelectorParams extends Params def getAlpha: Double = $(alpha) - def getSelectorType: SelectorType.Value = selectorType + def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType } @@ -90,19 +90,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def setNumTopFeatures(value: Int): this.type = { chiSqSelector.setNumTopFeatures(value) - chiSqSelector.setSelectorType(SelectorType.KBest) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest) set(numTopFeatures, value) } def setPercentile(value: Int): this.type = { chiSqSelector.setPercentile(value) - chiSqSelector.setSelectorType(SelectorType.Percentile) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile) set(percentile, value) } def setAlpha(value: Double): this.type = { chiSqSelector.setAlpha(value) - chiSqSelector.setSelectorType(SelectorType.Fpr) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr) set(alpha, value) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index bdcfe70651e3d..a80cca70f4b28 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable { * Extra care needs to be taken in the Python code to ensure it gets freed on * exit; see the Py4J documentation. */ - def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector().fit(data.rdd) + def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector(numTopFeatures).fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 9bc75c65165fd..e2345b85a279e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} -object SelectorType extends Enumeration { +object ChiSqSelectorType extends Enumeration { type SelectorType = Value val KBest, Percentile, Fpr = Value } @@ -77,7 +77,6 @@ class ChiSqSelectorModel @Since("1.3.0") ( */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { val orderedIndices = filterIndices.sorted - require(isSorted(orderedIndices), "Array has to be sorted asc") features match { case SparseVector(size, indices, values) => val newSize = orderedIndices.length @@ -178,30 +177,34 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. */ -@Since("1.3.0") -class ChiSqSelector @Since("1.3.0") () extends Serializable { - var numTopFeatures: Int = 1 - var percentile: Int = 10 - var alpha: Double = 0.05 - var selectorType = SelectorType.KBest - var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0) - +@Since("2.1.0") +class ChiSqSelector @Since("2.1.0") () extends Serializable { + private var numTopFeatures: Int = 1 + private var percentile: Int = 10 + private var alpha: Double = 0.05 + private var selectorType = ChiSqSelectorType.KBest + private var chiSqTestResult: Array[ChiSqTestResult] = _ + + def this(numTopFeatures: Int) { + this() + this.numTopFeatures = numTopFeatures + } def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value - selectorType = SelectorType.KBest + selectorType = ChiSqSelectorType.KBest this } def setPercentile(value: Int): this.type = { percentile = value - selectorType = SelectorType.Percentile + selectorType = ChiSqSelectorType.Percentile this } def setAlpha(value: Double): this.type = { alpha = value - selectorType = SelectorType.Fpr + selectorType = ChiSqSelectorType.Fpr this } - def setSelectorType(value: SelectorType.Value): this.type = { + def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value this } @@ -217,9 +220,10 @@ class ChiSqSelector @Since("1.3.0") () extends Serializable { def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { chiSqTestResult = Statistics.chiSqTest(data) selectorType match { - case SelectorType.KBest => selectKBest(numTopFeatures) - case SelectorType.Percentile => selectPercentile(percentile) - case SelectorType.Fpr => selectFpr(alpha) + case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) + case ChiSqSelectorType.Percentile => selectPercentile(percentile) + case ChiSqSelectorType.Fpr => selectFpr(alpha) + case _ => throw new Exception } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index d61888df9c0dc..e181a544f7159 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -58,7 +58,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) - val model = new ChiSqSelector().fit(labeledDiscreteData) + val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet From b522c5a0d8cbd5a545396117ca60890853f8a6d9 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 17 Aug 2016 17:59:16 +0800 Subject: [PATCH 4/7] add Since annotation --- .../spark/ml/feature/ChiSqSelector.scala | 7 ++++- .../spark/mllib/feature/ChiSqSelector.scala | 27 ++++++++++++++----- .../spark/ml/feature/ChiSqSelectorSuite.scala | 11 ++++++-- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index a44ac2fe73aea..5cfc6f036e0a1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -87,19 +87,21 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str def this() = this(Identifiable.randomUID("chiSqSelector")) val chiSqSelector = new feature.ChiSqSelector() /** @group setParam */ - @Since("1.6.0") + @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { chiSqSelector.setNumTopFeatures(value) chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest) set(numTopFeatures, value) } + @Since("2.1.0") def setPercentile(value: Int): this.type = { chiSqSelector.setPercentile(value) chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile) set(percentile, value) } + @Since("2.1.0") def setAlpha(value: Double): this.type = { chiSqSelector.setAlpha(value) chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr) @@ -130,16 +132,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } + @Since("2.1.0") def selectKBest(value: Int): ChiSqSelectorModel = { val model = chiSqSelector.selectKBest(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } + @Since("2.1.0") def selectPercentile(value: Int): ChiSqSelectorModel = { val model = chiSqSelector.selectPercentile(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } + @Since("2.1.0") def selectFpr(value: Double): ChiSqSelectorModel = { val model = chiSqSelector.selectFpr(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index e2345b85a279e..a5069d414dc5c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,6 +33,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} +@Since("2.1.0") object ChiSqSelectorType extends Enumeration { type SelectorType = Value val KBest, Percentile, Fpr = Value @@ -179,31 +180,40 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { */ @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { - private var numTopFeatures: Int = 1 + private var numTopFeatures: Int = 50 private var percentile: Int = 10 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest private var chiSqTestResult: Array[ChiSqTestResult] = _ + @Since("1.3.0") def this(numTopFeatures: Int) { this() this.numTopFeatures = numTopFeatures } + + @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value selectorType = ChiSqSelectorType.KBest this } + + @Since("2.1.0") def setPercentile(value: Int): this.type = { percentile = value selectorType = ChiSqSelectorType.Percentile this } + + @Since("2.1.0") def setAlpha(value: Double): this.type = { alpha = value selectorType = ChiSqSelectorType.Fpr this } + + @Since("2.1.0") def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value this @@ -219,14 +229,15 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { chiSqTestResult = Statistics.chiSqTest(data) - selectorType match { - case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) - case ChiSqSelectorType.Percentile => selectPercentile(percentile) - case ChiSqSelectorType.Fpr => selectFpr(alpha) - case _ => throw new Exception - } + selectorType match { + case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) + case ChiSqSelectorType.Percentile => selectPercentile(percentile) + case ChiSqSelectorType.Fpr => selectFpr(alpha) + case _ => throw new Exception("Unknown ChiSqSelector Type") + } } + @Since("2.1.0") def selectKBest(value: Int): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) @@ -234,6 +245,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { new ChiSqSelectorModel(indices) } + @Since("2.1.0") def selectPercentile(value: Int): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile / 100).toInt) @@ -241,6 +253,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { new ChiSqSelectorModel(indices) } + @Since("2.1.0") def selectFpr(value: Double): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha } .map { case (_, indices) => indices } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index 3558290b23ae0..a29ff83ae0cce 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -49,16 +49,23 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") - val model = new ChiSqSelector() + val selector = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") - model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } + + selector.selectPercentile(34).transform(df) + .select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } test("ChiSqSelector read/write") { From 3431a7a0dada65e06ab87d6cbac8fd36222c0a32 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Thu, 18 Aug 2016 11:00:42 +0800 Subject: [PATCH 5/7] Fix the bugs of Save/Load ML/ChiSqSelector --- .../spark/ml/feature/ChiSqSelector.scala | 43 +++++++++++++------ .../spark/mllib/feature/ChiSqSelector.scala | 8 ++-- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 5cfc6f036e0a1..a936790bf1697 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -52,26 +52,32 @@ private[feature] trait ChiSqSelectorParams extends Params " number of features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) setDefault(numTopFeatures -> 50) + + /** @group getParam */ + def getNumTopFeatures: Int = $(numTopFeatures) + final val percentile = new IntParam(this, "percentile", "Percentile of features that selector will select, ordered by statistics value descending.", ParamValidators.gtEq(0)) setDefault(percentile -> 10) + /** @group getParam */ + def getPercentile: Int = $(percentile) + final val alpha = new DoubleParam(this, "alpha", "The highest p-value for features to be kept.", ParamValidators.gtEq(0)) setDefault(alpha -> 0.05) - final val selectorType = ChiSqSelectorType.KBest - /** @group getParam */ - def getNumTopFeatures: Int = $(numTopFeatures) - - def getPercentile: Int = $(percentile) - def getAlpha: Double = $(alpha) - def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType + final val selectorType = new Param[String](this, "selectorType", + "ChiSqSelector Type: KBest, Percentile, Fpr") + setDefault(selectorType -> ChiSqSelectorType.KBest.toString) + + /** @group getParam */ + def getChiSqSelectorType: String = $(selectorType) } @@ -85,26 +91,23 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) - val chiSqSelector = new feature.ChiSqSelector() + var chiSqSelector: feature.ChiSqSelector = null /** @group setParam */ @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { - chiSqSelector.setNumTopFeatures(value) - chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest) + set(selectorType, ChiSqSelectorType.KBest.toString) set(numTopFeatures, value) } @Since("2.1.0") def setPercentile(value: Int): this.type = { - chiSqSelector.setPercentile(value) - chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile) + set(selectorType, ChiSqSelectorType.Percentile.toString) set(percentile, value) } @Since("2.1.0") def setAlpha(value: Double): this.type = { - chiSqSelector.setAlpha(value) - chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr) + set(selectorType, ChiSqSelectorType.Fpr.toString) set(alpha, value) } @@ -128,24 +131,36 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } + $(selectorType) match { + case "KBest" => + chiSqSelector = new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)) + case "Percentile" => + chiSqSelector = new feature.ChiSqSelector().setPercentile($(percentile)) + case "Fpr" => + chiSqSelector = new feature.ChiSqSelector().setAlpha($(alpha)) + case _ => throw new Exception("Unknown ChiSqSelector Type.") + } val model = chiSqSelector.fit(input) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } @Since("2.1.0") def selectKBest(value: Int): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectKBest(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } @Since("2.1.0") def selectPercentile(value: Int): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectPercentile(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } @Since("2.1.0") def selectFpr(value: Double): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectFpr(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index a5069d414dc5c..b58e286560388 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -191,28 +191,28 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { this() this.numTopFeatures = numTopFeatures } - + @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value selectorType = ChiSqSelectorType.KBest this } - + @Since("2.1.0") def setPercentile(value: Int): this.type = { percentile = value selectorType = ChiSqSelectorType.Percentile this } - + @Since("2.1.0") def setAlpha(value: Double): this.type = { alpha = value selectorType = ChiSqSelectorType.Fpr this } - + @Since("2.1.0") def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value From 89e2dd5df055d55fe627e679c9c790d592203c8d Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Thu, 18 Aug 2016 12:39:48 +0800 Subject: [PATCH 6/7] Change ChiSqSelectorType in ml to String --- .../spark/ml/feature/ChiSqSelector.scala | 19 +++++++++---------- .../spark/mllib/feature/ChiSqSelector.scala | 6 +++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index a936790bf1697..5a53930135f18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,7 +27,6 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.feature.ChiSqSelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -56,13 +55,13 @@ private[feature] trait ChiSqSelectorParams extends Params /** @group getParam */ def getNumTopFeatures: Int = $(numTopFeatures) - final val percentile = new IntParam(this, "percentile", + final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by statistics value descending.", ParamValidators.gtEq(0)) setDefault(percentile -> 10) /** @group getParam */ - def getPercentile: Int = $(percentile) + def getPercentile: Double = $(percentile) final val alpha = new DoubleParam(this, "alpha", "The highest p-value for features to be kept.", @@ -74,7 +73,7 @@ private[feature] trait ChiSqSelectorParams extends Params final val selectorType = new Param[String](this, "selectorType", "ChiSqSelector Type: KBest, Percentile, Fpr") - setDefault(selectorType -> ChiSqSelectorType.KBest.toString) + setDefault(selectorType -> "KBest") /** @group getParam */ def getChiSqSelectorType: String = $(selectorType) @@ -95,19 +94,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str /** @group setParam */ @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { - set(selectorType, ChiSqSelectorType.KBest.toString) + set(selectorType, "KBest") set(numTopFeatures, value) } @Since("2.1.0") - def setPercentile(value: Int): this.type = { - set(selectorType, ChiSqSelectorType.Percentile.toString) + def setPercentile(value: Double): this.type = { + set(selectorType, "Percentile") set(percentile, value) } @Since("2.1.0") def setAlpha(value: Double): this.type = { - set(selectorType, ChiSqSelectorType.Fpr.toString) + set(selectorType, "Fpr") set(alpha, value) } @@ -129,7 +128,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str val input: RDD[OldLabeledPoint] = dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => - OldLabeledPoint(label, OldVectors.fromML(features)) + OldLabeledPoint(label, OldVectors.fromML(features)) } $(selectorType) match { case "KBest" => @@ -152,7 +151,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str } @Since("2.1.0") - def selectPercentile(value: Int): ChiSqSelectorModel = { + def selectPercentile(value: Double): ChiSqSelectorModel = { require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectPercentile(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index b58e286560388..eb4a24214efd5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -181,7 +181,7 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { private var numTopFeatures: Int = 50 - private var percentile: Int = 10 + private var percentile: Double = 10 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest private var chiSqTestResult: Array[ChiSqTestResult] = _ @@ -200,7 +200,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { } @Since("2.1.0") - def setPercentile(value: Int): this.type = { + def setPercentile(value: Double): this.type = { percentile = value selectorType = ChiSqSelectorType.Percentile this @@ -246,7 +246,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { } @Since("2.1.0") - def selectPercentile(value: Int): ChiSqSelectorModel = { + def selectPercentile(value: Double): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile / 100).toInt) .map { case (_, indices) => indices } From ab96c060382ec3fe3d1774d02a25cd0fbf9a2544 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Thu, 18 Aug 2016 12:52:41 +0800 Subject: [PATCH 7/7] Delete White line, and add Since --- .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 4 +++- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 5a53930135f18..d6b847a7770b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -79,7 +79,6 @@ private[feature] trait ChiSqSelectorParams extends Params def getChiSqSelectorType: String = $(selectorType) } - /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. @@ -90,7 +89,10 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) + + @Since("2.1.0") var chiSqSelector: feature.ChiSqSelector = null + /** @group setParam */ @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index eb4a24214efd5..1c3b49a04b843 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -181,7 +181,7 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { private var numTopFeatures: Int = 50 - private var percentile: Double = 10 + private var percentile: Double = 10.0 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest private var chiSqTestResult: Array[ChiSqTestResult] = _ @@ -260,4 +260,3 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { new ChiSqSelectorModel(indices) } } -