/*
 * Decompiled with CFR 0.152.
 */
package weka.filters.unsupervised.attribute;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.Stopwords;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

public class StringToWordVector
extends Filter
implements UnsupervisedFilter,
OptionHandler {
    /** Serialization version identifier. */
    static final long serialVersionUID = 8249106275278565424L;
    /** Delimiter characters handed to StringTokenizer when alphabetic-only tokenizing is off. */
    private String delimiters = " \n\t.,:'\"()?!";
    /** Range of attributes to convert; when null, determineSelectedRange() selects every type-2 attribute. */
    protected Range m_SelectedRange = null;
    /** Maps each kept word to its attribute index in the output format. */
    private TreeMap m_Dictionary = new TreeMap();
    /** True to output word counts, false to output 1.0 for any word that is present. */
    private boolean m_OutputCounts = false;
    /** Prefix prepended to the name of every generated word attribute. */
    private String m_Prefix = "";
    /** Per output attribute index: number of first-batch documents containing that word. */
    private int[] docsCounts;
    /** Number of instances in the first batch; -1 until the dictionary has been determined. */
    private int numInstances = -1;
    /** Average Euclidean length of the first-batch word vectors; negative means "not computed". */
    private double avgDocLength = -1.0;
    /** Number of words to try to keep (applied per word table in determineDictionary()). */
    private int m_WordsToKeep = 1000;
    /** True to replace each word value v by log(1 + v). */
    private boolean m_TFTransform;
    /** Selected normalization mode; one of the FILTER_* constants below. */
    protected int m_filterType = 0;
    /** Normalization mode: no normalization. */
    public static final int FILTER_NONE = 0;
    /** Normalization mode: normalize all data. */
    public static final int FILTER_NORMALIZE_ALL = 1;
    /** Normalization mode: normalize only data arriving after the first batch. */
    public static final int FILTER_NORMALIZE_TEST_ONLY = 2;
    /** Tags describing the available normalization modes. */
    public static final Tag[] TAGS_FILTER = new Tag[]{new Tag(0, "No normalization"), new Tag(1, "Normalize all data"), new Tag(2, "Normalize test data only")};
    /** True to multiply each word value by log(numInstances / docsCounts[word]). */
    private boolean m_IDFTransform;
    /** True to tokenize with AlphabeticStringTokenizer instead of the delimiter set. */
    private boolean m_onlyAlphabeticTokens;
    /** True to lower-case every token before stemming and dictionary lookup. */
    private boolean m_lowerCaseTokens;
    /** True to drop tokens that Stopwords.isStopword() reports as stop words. */
    private boolean m_useStoplist;
    /** Stemmer applied to every token; never null (setStemmer substitutes a NullStemmer). */
    private Stemmer m_Stemmer = new NullStemmer();
    /** Lower bound on the term-frequency threshold used when pruning the dictionary. */
    private int m_minTermFreq = 1;
    /** True to build a single word table even when a class attribute is set. */
    private boolean m_doNotOperateOnPerClassBasis = false;

    /**
     * Lists the command-line options understood by this filter.
     *
     * @return an enumeration of {@link Option} descriptions, one per option
     */
    public Enumeration listOptions() {
        Vector<Option> options = new Vector<Option>();

        options.add(new Option(
                "\tOutput word counts rather than boolean word presence.\n",
                "C", 0, "-C"));
        options.add(new Option(
                "\tString containing the set of delimiter characters\n"
                + "\t(default: \" \\n\\t.,:'\\\"()?!\")",
                "D", 1, "-D <delimiter set>"));
        options.add(new Option(
                "\tSpecify list of string attributes to convert to words (as weka Range).\n"
                + "\t(default: select all string attributes)",
                "R", 1, "-R <index1,index2-index4,...>"));
        options.add(new Option(
                "\tSpecify a prefix for the created attribute names.\n"
                + "\t(default: \"\")",
                "P", 1, "-P <attribute name prefix>"));
        options.add(new Option(
                "\tSpecify approximate number of word fields to create.\n"
                + "\tSurplus words will be discarded..\n"
                + "\t(default: 1000)",
                "W", 1, "-W <number of words to keep>"));
        options.add(new Option(
                "\tTransform the word frequencies into log(1+fij)\n"
                + "\twhere fij is the frequency of word i in jth document(instance).\n",
                "T", 0, "-T"));
        options.add(new Option(
                "\tTransform each word frequency into:\n"
                + "\tfij*log(num of Documents/num of  documents containing word i)\n"
                + "\t  where fij if frequency of word i in  jth document(instance)",
                "I", 0, "-I"));
        options.add(new Option(
                "\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n"
                + "\tto average length of training documents (default 0=don't normalize).",
                "N", 1, "-N"));
        options.add(new Option(
                "\tOnly form tokens from contiguous alphabetic sequences\n"
                + "\t(The delimiter string is ignored if this is set).",
                "A", 0, "-A"));
        options.add(new Option(
                "\tConvert all tokens to lowercase before adding to the dictionary.",
                "L", 0, "-L"));
        options.add(new Option(
                "\tIgnore words that are in the stoplist.",
                "S", 0, "-S"));
        options.add(new Option(
                "\tThe stemmering algorihtm (classname plus parameters) to use.",
                "stemmer", 1, "-stemmer <spec>"));
        options.add(new Option(
                "\tThe minimum term frequency (default = 1).",
                "M", 1, "-M <int>"));
        options.add(new Option(
                "\tIf this is set, the maximum number of words and the \n"
                + "\tminimum term frequency is not enforced on a per-class \n"
                + "\tbasis but based on the documents in all the classes \n"
                + "\t(even if a class attribute is set).",
                "O", 0, "-O"));

        return options.elements();
    }

    /**
     * Parses the given option list and configures the filter accordingly.
     * Valued options (-D, -R, -P, -W, -M) only overwrite the current setting
     * when present; flags, -N and -stemmer are always (re)set.
     *
     * @param stringArray the options, in the form produced by {@link #getOptions()}
     * @throws Exception if an option value cannot be parsed or the stemmer
     *                   class cannot be instantiated
     */
    public void setOptions(String[] stringArray) throws Exception {
        String value = Utils.getOption('D', stringArray);
        if (value.length() > 0) {
            setDelimiters(value);
        }

        value = Utils.getOption('R', stringArray);
        if (value.length() > 0) {
            setSelectedRange(value);
        }

        value = Utils.getOption('P', stringArray);
        if (value.length() > 0) {
            setAttributeNamePrefix(value);
        }

        value = Utils.getOption('W', stringArray);
        if (value.length() > 0) {
            setWordsToKeep(Integer.parseInt(value));
        }

        value = Utils.getOption('M', stringArray);
        if (value.length() > 0) {
            setMinTermFreq(Integer.parseInt(value));
        }

        setOutputWordCounts(Utils.getFlag('C', stringArray));
        setTFTransform(Utils.getFlag('T', stringArray));
        setIDFTransform(Utils.getFlag('I', stringArray));
        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', stringArray));

        // A missing -N falls back to tag 0 ("No normalization").
        String normalization = Utils.getOption('N', stringArray);
        int tagId = normalization.length() > 0 ? Integer.parseInt(normalization) : 0;
        setNormalizeDocLength(new SelectedTag(tagId, TAGS_FILTER));

        setLowerCaseTokens(Utils.getFlag('L', stringArray));
        setOnlyAlphabeticTokens(Utils.getFlag('A', stringArray));
        setUseStoplist(Utils.getFlag('S', stringArray));

        String stemmerSpec = Utils.getOption("stemmer", stringArray);
        if (stemmerSpec.length() == 0) {
            // setStemmer(null) installs the default NullStemmer.
            setStemmer(null);
            return;
        }

        String[] stemmerOptions = Utils.splitOptions(stemmerSpec);
        if (stemmerOptions.length == 0) {
            throw new Exception("Invalid stemmer specification string");
        }
        // First element is the class name; blank it so only real options remain.
        String stemmerClassName = stemmerOptions[0];
        stemmerOptions[0] = "";
        Stemmer stemmer = (Stemmer)Class.forName(stemmerClassName).newInstance();
        if (stemmer instanceof OptionHandler) {
            ((OptionHandler)((Object)stemmer)).setOptions(stemmerOptions);
        }
        setStemmer(stemmer);
    }

    /**
     * Returns the current configuration as an option array that can be fed
     * back into {@link #setOptions(String[])}.
     *
     * <p>The result always has 22 slots; unused trailing slots hold empty
     * strings.
     *
     * @return the option array
     */
    public String[] getOptions() {
        String[] options = new String[22];
        int current = 0;

        options[current++] = "-D";
        options[current++] = this.getDelimiters();

        if (this.getSelectedRange() != null) {
            options[current++] = "-R";
            // The input format may not have been set yet (input() and
            // batchFinished() both check for null); only refresh the range's
            // upper bound when it is available instead of throwing an NPE.
            Instances inputFormat = this.getInputFormat();
            if (inputFormat != null) {
                this.m_SelectedRange.setUpper(inputFormat.numAttributes() - 1);
            }
            options[current++] = this.getSelectedRange().getRanges();
        }

        if (!"".equals(this.getAttributeNamePrefix())) {
            options[current++] = "-P";
            options[current++] = this.getAttributeNamePrefix();
        }

        options[current++] = "-W";
        options[current++] = String.valueOf(this.getWordsToKeep());

        if (this.getOutputWordCounts()) {
            options[current++] = "-C";
        }
        if (this.getTFTransform()) {
            options[current++] = "-T";
        }
        if (this.getIDFTransform()) {
            options[current++] = "-I";
        }

        options[current++] = "-N";
        options[current++] = "" + this.m_filterType;

        if (this.getLowerCaseTokens()) {
            options[current++] = "-L";
        }
        if (this.getOnlyAlphabeticTokens()) {
            options[current++] = "-A";
        }
        if (this.getUseStoplist()) {
            options[current++] = "-S";
        }

        if (this.getStemmer() != null) {
            options[current++] = "-stemmer";
            String spec = this.getStemmer().getClass().getName();
            if (this.getStemmer() instanceof OptionHandler) {
                spec = spec + " " + Utils.joinOptions(((OptionHandler)((Object)this.getStemmer())).getOptions());
            }
            options[current++] = spec.trim();
        }

        options[current++] = "-M";
        options[current++] = String.valueOf(this.getMinTermFreq());

        if (this.getDoNotOperateOnPerClassBasis()) {
            options[current++] = "-O";
        }

        // Pad the remaining slots so no element is null.
        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    /**
     * Creates a filter with the default settings declared on the fields.
     */
    public StringToWordVector() {
        super();
    }

    /**
     * Creates a filter that tries to keep the given number of words.
     *
     * @param n the number of words to keep
     */
    public StringToWordVector(int n) {
        super();
        m_WordsToKeep = n;
    }

    /**
     * Returns the capabilities of this filter: the superclass capabilities
     * with all attribute types, all class types, missing values, missing
     * class values and "no class" enabled.
     *
     * @return the capabilities
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();

        // attributes
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);

        // class
        result.enableAllClasses();
        result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        result.enable(Capabilities.Capability.NO_CLASS);

        return result;
    }

    /**
     * Sets the format of the input instances and resets the statistics
     * gathered from any previous first batch.
     *
     * @param instances the input format
     * @return always false: the output format is only known after the first
     *         batch has been processed
     * @throws Exception if the superclass rejects the format
     */
    public boolean setInputFormat(Instances instances) throws Exception {
        super.setInputFormat(instances);
        // -1 marks both statistics as "not yet computed".
        avgDocLength = -1.0;
        numInstances = -1;
        return false;
    }

    /**
     * Feeds one instance to the filter. During the first batch the instance
     * is only buffered; afterwards it is converted immediately using the
     * dictionary built from the first batch.
     *
     * @param instance the instance to process
     * @return true if a converted instance was pushed to the output queue,
     *         false if the instance was merely buffered
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if normalization fails
     */
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        if (!isFirstBatchDone()) {
            // Dictionary not built yet: keep the instance for batchFinished().
            bufferInput(instance);
            return false;
        }

        FastVector converted = new FastVector();
        int firstWordIndex = convertInstancewoDocNorm(instance, converted);
        Instance result = (Instance)converted.elementAt(0);
        if (m_filterType != FILTER_NONE) {
            normalizeInstance(result, firstWordIndex);
        }
        push(result);
        return true;
    }

    /**
     * Signals the end of a batch. For the first batch this builds the
     * dictionary, converts every buffered instance, computes the average
     * document length when normalization is enabled, and pushes the results.
     *
     * @return true if there are instances pending in the output queue
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if normalization fails
     */
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }

        if (!isFirstBatchDone()) {
            determineDictionary();

            // Convert every buffered instance; firstWordIndex is the index of
            // the first generated word attribute in the output format.
            FastVector converted = new FastVector();
            int firstWordIndex = 0;
            for (int i = 0; i < numInstances; i++) {
                firstWordIndex = convertInstancewoDocNorm(getInputFormat().instance(i), converted);
            }

            if (m_filterType != FILTER_NONE) {
                // Average Euclidean length over the word attributes only.
                avgDocLength = 0.0;
                for (int i = 0; i < converted.size(); i++) {
                    Instance current = (Instance)converted.elementAt(i);
                    double sumOfSquares = 0.0;
                    for (int pos = 0; pos < current.numValues(); pos++) {
                        if (current.index(pos) >= firstWordIndex) {
                            sumOfSquares += current.valueSparse(pos) * current.valueSparse(pos);
                        }
                    }
                    avgDocLength += Math.sqrt(sumOfSquares);
                }
                avgDocLength /= (double)numInstances;
            }

            // Only the "normalize all" mode touches the first batch itself.
            if (m_filterType == FILTER_NORMALIZE_ALL) {
                for (int i = 0; i < converted.size(); i++) {
                    normalizeInstance((Instance)converted.elementAt(i), firstWordIndex);
                }
            }

            for (int i = 0; i < converted.size(); i++) {
                push((Instance)converted.elementAt(i));
            }
        }

        flushInput();
        m_NewBatch = true;
        m_FirstBatchDone = true;
        return numPendingOutput() != 0;
    }

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Converts String attributes into a set of attributes representing word "
            + "occurrence information from the text contained in the strings. "
            + "The set of words (attributes) is determined by the first batch filtered "
            + "(typically training data).";
    }

    /**
     * Returns whether word counts are output instead of presence flags.
     *
     * @return true if counts are output
     */
    public boolean getOutputWordCounts() {
        return m_OutputCounts;
    }

    /**
     * Sets whether word counts are output instead of presence flags.
     *
     * @param bl true to output counts
     */
    public void setOutputWordCounts(boolean bl) {
        m_OutputCounts = bl;
    }

    /**
     * Returns the tip text for the outputWordCounts property.
     *
     * @return the tip text
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1"
            + "(indicating presence or absence of a word).";
    }

    /**
     * Returns the delimiter set with every double quote and apostrophe
     * preceded by a backslash.
     *
     * @return the escaped delimiter string
     */
    public String getDelimiters() {
        // Literal (non-regex) replacement: " -> \" and then ' -> \'
        String escapedQuotes = delimiters.replace("\"", "\\\"");
        return escapedQuotes.replace("'", "\\'");
    }

    /**
     * Sets the delimiter set, undoing the backslash escaping of double
     * quotes and apostrophes applied by {@link #getDelimiters()}.
     *
     * @param string the (possibly escaped) delimiter characters
     */
    public void setDelimiters(String string) {
        // Literal (non-regex) replacement: \" -> " and then \' -> '
        String unescapedQuotes = string.replace("\\\"", "\"");
        delimiters = unescapedQuotes.replace("\\'", "'");
    }

    /**
     * Returns the tip text for the delimiters property.
     *
     * @return the tip text
     */
    public String delimitersTipText() {
        return "Set of delimiter characters to use in tokenizing "
            + "(default: \" \\n\\t.,:'\\\"()?!\"). "
            + "This option is ignored if onlyAlphabeticTokens option is set to true.";
    }

    /**
     * Returns the range of attributes selected for conversion.
     *
     * @return the range, or null if none has been set or determined yet
     */
    public Range getSelectedRange() {
        return m_SelectedRange;
    }

    /**
     * Sets the range of attributes to convert.
     *
     * @param string the range specification passed to the {@link Range} constructor
     */
    public void setSelectedRange(String string) {
        Range parsed = new Range(string);
        m_SelectedRange = parsed;
    }

    /**
     * Returns the prefix used for the names of generated word attributes.
     *
     * @return the prefix
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * Sets the prefix used for the names of generated word attributes.
     *
     * @param string the prefix
     */
    public void setAttributeNamePrefix(String string) {
        m_Prefix = string;
    }

    /**
     * Returns the tip text for the attributeNamePrefix property.
     *
     * @return the tip text
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. "
            + "(default: \"\")";
    }

    /**
     * Returns the number of words the filter attempts to keep.
     *
     * @return the number of words to keep
     */
    public int getWordsToKeep() {
        return m_WordsToKeep;
    }

    /**
     * Sets the number of words the filter attempts to keep.
     *
     * @param n the number of words to keep
     */
    public void setWordsToKeep(int n) {
        m_WordsToKeep = n;
    }

    /**
     * Returns the tip text for the wordsToKeep property.
     *
     * @return the tip text
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute assigned) "
            + "to attempt to keep.";
    }

    /**
     * Returns whether the log(1 + value) transform is applied to word values.
     *
     * @return true if the TF transform is enabled
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * Sets whether the log(1 + value) transform is applied to word values.
     *
     * @param bl true to enable the TF transform
     */
    public void setTFTransform(boolean bl) {
        m_TFTransform = bl;
    }

    /**
     * Returns the tip text for the TFTransform property.
     *
     * @return the tip text
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n"
            + "    log(1+fij) \n"
            + "       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns whether word values are scaled by
     * log(number of documents / number of documents containing the word).
     *
     * @return true if the IDF transform is enabled
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * Sets whether word values are scaled by
     * log(number of documents / number of documents containing the word).
     *
     * @param bl true to enable the IDF transform
     */
    public void setIDFTransform(boolean bl) {
        m_IDFTransform = bl;
    }

    /**
     * Returns the tip text for the IDFTransform property.
     *
     * @return the tip text
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n"
            + "   fij*log(num of Docs/num of Docs with word i) \n"
            + "      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns the current normalization mode wrapped as a tag from
     * {@link #TAGS_FILTER}.
     *
     * @return the selected normalization tag
     */
    public SelectedTag getNormalizeDocLength() {
        SelectedTag current = new SelectedTag(m_filterType, TAGS_FILTER);
        return current;
    }

    /**
     * Sets the normalization mode. The call is ignored unless the tag was
     * built from {@link #TAGS_FILTER} (compared by identity).
     *
     * @param selectedTag the normalization mode to select
     */
    public void setNormalizeDocLength(SelectedTag selectedTag) {
        if (selectedTag.getTags() != TAGS_FILTER) {
            return;
        }
        m_filterType = selectedTag.getSelectedTag().getID();
    }

    /**
     * Returns the tip text for the normalizeDocLength property.
     *
     * @return the tip text
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) "
            + "should be normalized or not.";
    }

    /**
     * Returns whether tokens are formed only from contiguous alphabetic
     * sequences.
     *
     * @return true if alphabetic-only tokenizing is enabled
     */
    public boolean getOnlyAlphabeticTokens() {
        return m_onlyAlphabeticTokens;
    }

    /**
     * Sets whether tokens are formed only from contiguous alphabetic
     * sequences.
     *
     * @param bl true to enable alphabetic-only tokenizing
     */
    public void setOnlyAlphabeticTokens(boolean bl) {
        m_onlyAlphabeticTokens = bl;
    }

    /**
     * Returns the tip text for the onlyAlphabeticTokens property.
     *
     * @return the tip text
     */
    public String onlyAlphabeticTokensTipText() {
        return "Sets whether if the word tokens are to be formed only from "
            + "contiguous alphabetic sequences "
            + "(The delimiter string is ignored if this option is set to true).";
    }

    /**
     * Returns whether tokens are converted to lower case.
     *
     * @return true if lower-casing is enabled
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param bl true to enable lower-casing
     */
    public void setLowerCaseTokens(boolean bl) {
        m_lowerCaseTokens = bl;
    }

    /**
     * Returns the tip text for the doNotOperateOnPerClassBasis property.
     *
     * @return the tip text
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the "
            + "minimum term frequency is not enforced on a per-class "
            + "basis but based on the documents in all the classes "
            + "(even if a class attribute is set).";
    }

    /**
     * Returns whether per-class dictionary building is disabled.
     *
     * @return true if a single word table is used for all classes
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return m_doNotOperateOnPerClassBasis;
    }

    /**
     * Sets whether per-class dictionary building is disabled.
     *
     * @param bl true to use a single word table for all classes
     */
    public void setDoNotOperateOnPerClassBasis(boolean bl) {
        m_doNotOperateOnPerClassBasis = bl;
    }

    /**
     * Returns the tip text for the minTermFreq property.
     *
     * @return the tip text
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. "
            + "This is enforced on a per-class basis.";
    }

    /**
     * Returns the minimum term frequency.
     *
     * @return the minimum term frequency
     */
    public int getMinTermFreq() {
        return m_minTermFreq;
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param n the minimum term frequency
     */
    public void setMinTermFreq(int n) {
        m_minTermFreq = n;
    }

    /**
     * Returns the tip text for the lowerCaseTokens property.
     *
     * @return the tip text
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case "
            + "before being added to the dictionary.";
    }

    /**
     * Returns whether stop words are ignored.
     *
     * @return true if the stoplist is used
     */
    public boolean getUseStoplist() {
        return m_useStoplist;
    }

    /**
     * Sets whether stop words are ignored.
     *
     * @param bl true to use the stoplist
     */
    public void setUseStoplist(boolean bl) {
        m_useStoplist = bl;
    }

    /**
     * Returns the tip text for the useStoplist property.
     *
     * @return the tip text
     */
    public String useStoplistTipText() {
        return "Ignores all the words that are on the stoplist, "
            + "if set to true.";
    }

    /**
     * Sets the stemmer applied to every token.
     *
     * @param stemmer the stemmer to use; null selects a {@link NullStemmer}
     */
    public void setStemmer(Stemmer stemmer) {
        if (stemmer == null) {
            m_Stemmer = new NullStemmer();
        } else {
            m_Stemmer = stemmer;
        }
    }

    /**
     * Returns the stemmer applied to every token.
     *
     * @return the current stemmer
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Returns the tip text for the stemmer property.
     *
     * @return the tip text
     */
    public String stemmerTipText() {
        final String tip = "The stemming algorithm to use on the words.";
        return tip;
    }

    /**
     * Sorts the given array in place into ascending order.
     *
     * <p>The previous hand-rolled shell sort used 1-based indexing and never
     * looked at element 0, so the result was only sorted from index 1
     * onwards. determineDictionary() reads the threshold from an index of
     * this array (index 0 when the number of words equals the number to
     * keep), so the whole array must be ordered.
     *
     * @param nArray the array to sort
     */
    private static void sortArray(int[] nArray) {
        // Fully qualified so no additional import is required.
        java.util.Arrays.sort(nArray);
    }

    /**
     * Finalizes the range of attributes to convert. If no range was given,
     * every attribute whose type code is 2 is selected; a given range is
     * narrowed to the attributes inside it whose type code is 2.
     */
    private void determineSelectedRange() {
        Instances format = getInputFormat();

        if (m_SelectedRange == null) {
            // No user range: start from every type-2 attribute (1-based list).
            StringBuilder allCandidates = new StringBuilder();
            for (int i = 0; i < format.numAttributes(); i++) {
                if (format.attribute(i).type() == 2) {
                    allCandidates.append(i + 1).append(",");
                }
            }
            m_SelectedRange = new Range(allCandidates.toString());
        }
        m_SelectedRange.setUpper(format.numAttributes() - 1);

        // Keep only in-range attributes of type 2.
        StringBuilder kept = new StringBuilder();
        for (int i = 0; i < format.numAttributes(); i++) {
            if (m_SelectedRange.isInRange(i) && format.attribute(i).type() == 2) {
                kept.append(i + 1).append(",");
            }
        }
        m_SelectedRange.setRanges(kept.toString());
        m_SelectedRange.setUpper(format.numAttributes() - 1);
    }

    /**
     * Builds the word dictionary and the output format from the buffered
     * first-batch instances.
     *
     * <p>Steps: (1) count, per word table, how often each token occurs and in
     * how many documents; (2) derive a minimum-count threshold per table from
     * m_WordsToKeep and m_minTermFreq; (3) copy the non-converted attributes
     * and append one attribute per surviving word; (4) record per-word
     * document counts, the dictionary, the instance count and the output
     * format.
     */
    private void determineDictionary() {
        Instances inputFormat = this.getInputFormat();
        int classIndex = inputFormat.classIndex();

        // One word table per class value, or a single shared table. A class
        // attribute reporting zero values would otherwise yield an empty
        // table array and an ArrayIndexOutOfBoundsException on the first
        // token, so fall back to the shared table in that case.
        boolean perClass = !this.m_doNotOperateOnPerClassBasis
            && classIndex != -1
            && inputFormat.attribute(classIndex).numValues() > 0;
        int numTables = perClass ? inputFormat.attribute(classIndex).numValues() : 1;

        TreeMap[] wordTables = new TreeMap[numTables];
        for (int t = 0; t < numTables; ++t) {
            wordTables[t] = new TreeMap();
        }

        this.determineSelectedRange();

        // Step 1: term counts and document counts.
        for (int i = 0; i < inputFormat.numInstances(); ++i) {
            Instance instance = inputFormat.instance(i);
            int table = perClass ? (int)instance.classValue() : 0;

            // Distinct words seen in this document (used as a set).
            Hashtable<String, Integer> seenInDocument = new Hashtable<String, Integer>();

            for (int a = 0; a < instance.numAttributes(); ++a) {
                if (!this.m_SelectedRange.isInRange(a) || instance.isMissing(a)) {
                    continue;
                }
                Enumeration tokens;
                if (this.m_onlyAlphabeticTokens) {
                    tokens = new AlphabeticStringTokenizer(instance.stringValue(a));
                } else {
                    tokens = new StringTokenizer(instance.stringValue(a), this.delimiters);
                }
                while (tokens.hasMoreElements()) {
                    String word = ((String)tokens.nextElement()).intern();
                    if (this.m_lowerCaseTokens) {
                        word = word.toLowerCase();
                    }
                    word = this.m_Stemmer.stem(word);
                    if (this.m_useStoplist && Stopwords.isStopword(word)) {
                        continue;
                    }
                    // containsKey, not contains: contains() searches the values.
                    if (!seenInDocument.containsKey(word)) {
                        seenInDocument.put(word, Integer.valueOf(0));
                    }
                    Count count = (Count)wordTables[table].get(word);
                    if (count == null) {
                        wordTables[table].put(word, new Count(1));
                    } else {
                        ++count.count;
                    }
                }
            }

            Enumeration<String> seenWords = seenInDocument.keys();
            while (seenWords.hasMoreElements()) {
                String word = seenWords.nextElement();
                Count count = (Count)wordTables[table].get(word);
                if (count != null) {
                    ++count.docCount;
                } else {
                    System.err.println("Warning: A word should definitely be in the dictionary.Please check the code");
                }
            }
        }

        // Step 2: per-table minimum count needed for a word to be kept.
        int totalWords = 0;
        int[] minCounts = new int[numTables];
        for (int t = 0; t < numTables; ++t) {
            totalWords += wordTables[t].size();
            int[] counts = new int[wordTables[t].size()];
            int pos = 0;
            for (Object key : wordTables[t].keySet()) {
                counts[pos++] = ((Count)wordTables[t].get(key)).count;
            }
            StringToWordVector.sortArray(counts);
            minCounts[t] = counts.length < this.m_WordsToKeep
                ? this.m_minTermFreq
                : Math.max(this.m_minTermFreq, counts[counts.length - this.m_WordsToKeep]);
        }

        // Step 3: output attributes = non-converted attributes + word attributes.
        FastVector attributes = new FastVector(totalWords + inputFormat.numAttributes());
        int newClassIndex = -1;
        for (int a = 0; a < inputFormat.numAttributes(); ++a) {
            if (this.m_SelectedRange.isInRange(a)) {
                continue;
            }
            if (inputFormat.classIndex() == a) {
                newClassIndex = attributes.size();
            }
            attributes.addElement(inputFormat.attribute(a).copy());
        }

        TreeMap<String, Integer> dictionary = new TreeMap<String, Integer>();
        int nextIndex = attributes.size();
        for (int t = 0; t < numTables; ++t) {
            for (Object key : wordTables[t].keySet()) {
                String word = (String)key;
                Count count = (Count)wordTables[t].get(word);
                // Skip words below the threshold or already added via another table.
                if (count.count < minCounts[t] || dictionary.get(word) != null) {
                    continue;
                }
                dictionary.put(word, Integer.valueOf(nextIndex++));
                attributes.addElement(new Attribute(this.m_Prefix + word));
            }
        }

        // Step 4: document counts per output attribute, summed over all tables.
        this.docsCounts = new int[attributes.size()];
        for (String word : dictionary.keySet()) {
            int attributeIndex = dictionary.get(word).intValue();
            int documents = 0;
            for (int t = 0; t < numTables; ++t) {
                Count count = (Count)wordTables[t].get(word);
                if (count != null) {
                    documents += count.docCount;
                }
            }
            this.docsCounts[attributeIndex] = documents;
        }

        attributes.trimToSize();
        this.m_Dictionary = dictionary;
        this.numInstances = inputFormat.numInstances();

        Instances outputFormat = new Instances(inputFormat.relationName(), attributes, 0);
        outputFormat.setClassIndex(newClassIndex);
        this.setOutputFormat(outputFormat);
    }

    /**
     * Converts one input instance into a sparse word-vector instance (without
     * document-length normalization) and appends it to the given vector.
     *
     * @param instance   the input instance to convert
     * @param fastVector receives the converted {@link SparseInstance}
     * @return the output index of the first word attribute, i.e. the number
     *         of input attributes that were copied rather than converted
     */
    private int convertInstancewoDocNorm(Instance instance, FastVector fastVector) {
        // Output attribute index -> value, kept sorted by index.
        TreeMap<Integer, Double> contained = new TreeMap<Integer, Double>();

        // Copy every attribute that is not being converted.
        int firstCopy = 0;
        for (int i = 0; i < this.getInputFormat().numAttributes(); ++i) {
            if (this.m_SelectedRange.isInRange(i)) {
                continue;
            }
            if (this.getInputFormat().attribute(i).type() != 2) {
                // Sparse storage: zeros are left implicit.
                if (instance.value(i) != 0.0) {
                    contained.put(Integer.valueOf(firstCopy), Double.valueOf(instance.value(i)));
                }
            } else if (instance.isMissing(i)) {
                contained.put(Integer.valueOf(firstCopy), Double.valueOf(Instance.missingValue()));
            } else {
                // Reserve string index 0 with a dummy so that a real string
                // value never gets index 0.
                if (this.outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                    this.outputFormatPeek().attribute(firstCopy).addStringValue("Hack to defeat SparseInstance bug");
                }
                int stringIndex = this.outputFormatPeek().attribute(firstCopy).addStringValue(instance.stringValue(i));
                contained.put(Integer.valueOf(firstCopy), Double.valueOf(stringIndex));
            }
            ++firstCopy;
        }

        // Tokenize the converted attributes and record dictionary words.
        for (int i = 0; i < instance.numAttributes(); ++i) {
            if (!this.m_SelectedRange.isInRange(i) || instance.isMissing(i)) {
                continue;
            }
            Enumeration tokens;
            if (this.m_onlyAlphabeticTokens) {
                tokens = new AlphabeticStringTokenizer(instance.stringValue(i));
            } else {
                tokens = new StringTokenizer(instance.stringValue(i), this.delimiters);
            }
            while (tokens.hasMoreElements()) {
                String word = (String)tokens.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                word = this.m_Stemmer.stem(word);
                Integer index = (Integer)this.m_Dictionary.get(word);
                if (index == null) {
                    continue;
                }
                if (this.m_OutputCounts) {
                    Double previous = contained.get(index);
                    double updated = previous != null ? previous.doubleValue() + 1.0 : 1.0;
                    contained.put(index, Double.valueOf(updated));
                } else {
                    contained.put(index, Double.valueOf(1.0));
                }
            }
        }

        // TF transform: v -> log(v + 1), word attributes only.
        if (this.m_TFTransform) {
            for (Integer index : contained.keySet()) {
                if (index.intValue() >= firstCopy) {
                    double value = contained.get(index).doubleValue();
                    // Replacing the value of an existing key is not a structural change.
                    contained.put(index, Double.valueOf(Math.log(value + 1.0)));
                }
            }
        }

        // IDF transform: v -> v * log(numInstances / docsCounts[index]).
        if (this.m_IDFTransform) {
            for (Integer index : contained.keySet()) {
                if (index.intValue() >= firstCopy) {
                    double value = contained.get(index).doubleValue();
                    value *= Math.log((double)this.numInstances / (double)this.docsCounts[index.intValue()]);
                    contained.put(index, Double.valueOf(value));
                }
            }
        }

        // Flatten the sorted map into the parallel arrays SparseInstance expects.
        double[] values = new double[contained.size()];
        int[] indices = new int[contained.size()];
        int pos = 0;
        for (Integer index : contained.keySet()) {
            values[pos] = contained.get(index).doubleValue();
            indices[pos] = index.intValue();
            ++pos;
        }

        SparseInstance converted = new SparseInstance(instance.weight(), values, indices, this.outputFormatPeek().numAttributes());
        converted.setDataset(this.outputFormatPeek());
        fastVector.addElement(converted);
        return firstCopy;
    }

    /**
     * Rescales the word values of an instance so that their Euclidean length
     * equals the average document length of the first batch.
     *
     * @param instance the instance to normalize in place
     * @param n        index of the first word attribute; values stored at
     *                 lower attribute indices are left untouched
     * @throws Exception if the average document length has not been computed
     */
    private void normalizeInstance(Instance instance, int n) throws Exception {
        if (avgDocLength < 0.0) {
            throw new Exception("Average document length not set.");
        }

        // Euclidean length over the word attributes.
        double sumOfSquares = 0.0;
        for (int pos = 0; pos < instance.numValues(); pos++) {
            if (instance.index(pos) >= n) {
                sumOfSquares += instance.valueSparse(pos) * instance.valueSparse(pos);
            }
        }
        double docLength = Math.sqrt(sumOfSquares);

        int pos = 0;
        while (pos < instance.numValues()) {
            if (instance.index(pos) >= n) {
                double scaled = instance.valueSparse(pos) * avgDocLength / docLength;
                instance.setValueSparse(pos, scaled);
                if (scaled == 0.0) {
                    System.err.println("setting value " + instance.index(pos) + " to zero.");
                    // Stay on the same position and examine it again.
                    continue;
                }
            }
            pos++;
        }
    }

    /**
     * Command-line entry point: runs a new filter instance with the given
     * arguments.
     *
     * @param stringArray the command-line options
     */
    public static void main(String[] stringArray) {
        StringToWordVector filter = new StringToWordVector();
        StringToWordVector.runFilter(filter, stringArray);
    }

    /**
     * Enumerates the maximal runs of ASCII letters (a-z, A-Z) in a string;
     * every other character acts as a separator.
     */
    private class AlphabeticStringTokenizer
    implements Enumeration {
        /** The characters of the string being tokenized. */
        private char[] str;
        /** Position from which the next token search starts. */
        int currentPos = 0;

        /**
         * Creates a tokenizer over the given string.
         *
         * @param string the text to tokenize
         */
        public AlphabeticStringTokenizer(String string) {
            this.str = string.toCharArray();
        }

        /** Returns true if the character is an ASCII letter. */
        private boolean isLetter(char c) {
            return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
        }

        /** Returns the first position at or after pos that holds a letter, or str.length. */
        private int skipSeparators(int pos) {
            while (pos < this.str.length && !this.isLetter(this.str[pos])) {
                ++pos;
            }
            return pos;
        }

        /**
         * Advances past any separators and reports whether a token follows.
         *
         * @return true if at least one more token is available
         */
        public boolean hasMoreElements() {
            this.currentPos = this.skipSeparators(this.currentPos);
            return this.currentPos < this.str.length;
        }

        /**
         * Returns the next token.
         *
         * <p>The previous skip loop tested {@code c < 'a' && c > 'z' ...},
         * which is never true, so a call made while positioned on a separator
         * returned an empty string instead of the next word. Separators are
         * now skipped here as well, so the method no longer depends on
         * hasMoreElements() having been called first.
         *
         * @return the next run of letters as a String
         * @throws NoSuchElementException if no token remains
         */
        public Object nextElement() {
            int begin = this.skipSeparators(this.currentPos);
            this.currentPos = begin;
            if (begin >= this.str.length) {
                throw new NoSuchElementException("no more tokens present");
            }
            int end = begin;
            while (end < this.str.length && this.isLetter(this.str[end])) {
                ++end;
            }
            this.currentPos = end;
            return new String(this.str, begin, end - begin);
        }
    }

    /**
     * Mutable counters kept for one dictionary word.
     */
    private class Count
    implements Serializable {
        /** Serialization version identifier. */
        static final long serialVersionUID = 2157223818584474321L;
        /** Number of occurrences of the word. */
        public int count;
        /** Number of documents containing the word; starts at zero. */
        public int docCount;

        /**
         * Creates a counter with the given initial occurrence count.
         *
         * @param initialCount the starting value of {@link #count}
         */
        public Count(int initialCount) {
            count = initialCount;
        }
    }
}

