csli.util.classify.stanford
Class ClassifierUtils

java.lang.Object
  extended by csli.util.classify.stanford.ClassifierUtils

public abstract class ClassifierUtils
extends Object


Constructor Summary
ClassifierUtils()
           
 
Method Summary
static
<D extends edu.stanford.nlp.dbm.Datum>
Map<Integer,Double>
getFeatures(D datum, Map<String,Integer> featureMap)
           
static Pair<String,Double> getValuedFeature(Object feature)
           
static
<D extends edu.stanford.nlp.dbm.Datum>
List<Pair<Double,Double>>
normalizeFeatures(List<D> examples, Map<String,Integer> featureMap)
           
static double pLogP(double p)
          Convenience method that works around the fact that multiplying zero by -Infinity gives NaN (as happens when evaluating p*Math.log(p) for p=0)
static
<D extends edu.stanford.nlp.dbm.Datum>
void
prune(List<D> examples, Set<String> toPrune)
           
static
<D extends edu.stanford.nlp.dbm.Datum>
Set<String>
pruneFeatures(List<D> examples, List<String> specs, String posLabel, String negLabel)
           
static
<D extends edu.stanford.nlp.dbm.Datum>
Set<String>
pruneFeaturesByCorrelation(List<D> examples, double margin, String posLabel, String negLabel)
           
static
<D extends edu.stanford.nlp.dbm.Datum>
Set<String>
pruneFeaturesByFreq(List<D> examples, double threshold)
           
static
<D extends edu.stanford.nlp.dbm.Datum>
Set<String>
pruneFeaturesByInfoGain(List<D> examples, double margin, String posLabel, String negLabel)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

ClassifierUtils

public ClassifierUtils()
Method Detail

getFeatures

public static <D extends edu.stanford.nlp.dbm.Datum> Map<Integer,Double> getFeatures(D datum,
                                                                                     Map<String,Integer> featureMap)
Type Parameters:
D - a subclass of Datum
Parameters:
datum - the data instance
featureMap - a map of feature label to feature index number, or null to assume ordered numeric features
Returns:
a map from feature index number to feature value

getValuedFeature

public static Pair<String,Double> getValuedFeature(Object feature)
Parameters:
feature - a feature, either as a ScoredObject or just a plain Object
Returns:
a Pair of the String feature and its Double value (the score of a ScoredObject, or 1.0 otherwise)

normalizeFeatures

public static <D extends edu.stanford.nlp.dbm.Datum> List<Pair<Double,Double>> normalizeFeatures(List<D> examples,
                                                                                                 Map<String,Integer> featureMap)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a List of data instances to be normalized
featureMap - a map of feature label to feature number, or null to assume ordered numeric features
Returns:
a List of Pairs of Doubles (a, b) which record the normalization applied: f_norm = (f_raw - a) * b

pruneFeatures

public static <D extends edu.stanford.nlp.dbm.Datum> Set<String> pruneFeatures(List<D> examples,
                                                                               List<String> specs,
                                                                               String posLabel,
                                                                               String negLabel)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a list of Datum instances from which to prune features
specs - a list of String pruning specifications
posLabel -
negLabel -
Returns:
the set of feature labels removed

pruneFeaturesByFreq

public static <D extends edu.stanford.nlp.dbm.Datum> Set<String> pruneFeaturesByFreq(List<D> examples,
                                                                                     double threshold)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a list of Datum instances from which to prune features
threshold - the value of frequency below which features will be pruned
Returns:
the set of feature labels removed

pruneFeaturesByCorrelation

public static <D extends edu.stanford.nlp.dbm.Datum> Set<String> pruneFeaturesByCorrelation(List<D> examples,
                                                                                            double margin,
                                                                                            String posLabel,
                                                                                            String negLabel)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a list of Datum instances from which to prune features
posLabel -
negLabel -
Returns:
the set of feature labels removed

pruneFeaturesByInfoGain

public static <D extends edu.stanford.nlp.dbm.Datum> Set<String> pruneFeaturesByInfoGain(List<D> examples,
                                                                                         double margin,
                                                                                         String posLabel,
                                                                                         String negLabel)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a list of Datum instances from which to prune features
posLabel -
negLabel -
Returns:
the set of feature labels removed

prune

public static <D extends edu.stanford.nlp.dbm.Datum> void prune(List<D> examples,
                                                                Set<String> toPrune)
Type Parameters:
D - a subclass of Datum
Parameters:
examples - a list of Datum instances from which to prune features
toPrune - a set of feature labels to remove

pLogP

public static double pLogP(double p)
Convenience method that works around the fact that multiplying zero by -Infinity gives NaN (as happens when evaluating p*Math.log(p) for p=0)

Parameters:
p - a double probability
Returns:
p*Math.log(p), which will be 0 if p=0