/*
 * Decompiled with CFR 0.152.
 */
package com.qianxinyao.analysis.jieba.keyword;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.qianxinyao.analysis.jieba.keyword.Keyword;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Ranks the words of a text by TF-IDF score.
 *
 * <p>The text is segmented with {@link JiebaSegmenter}; stop words and segments of
 * length one or less are dropped. Each remaining word is scored as
 * {@code tf * idf}, where the IDF comes from {@code /idf_dict.txt} (or the median
 * of the loaded IDF values when the word is not in the dictionary).
 *
 * <p>The stop-word set and the IDF dictionary are loaded from the classpath on the
 * first call to {@link #analyze(String, int)} and cached in static fields. The lazy
 * loading is not synchronized.
 */
public class TFIDFAnalyzer {
    /** IDF value per word; populated from {@code /idf_dict.txt} on first use. */
    static HashMap<String, Double> idfMap;
    /** Words excluded from scoring; populated from {@code /stop_words.txt} on first use. */
    static HashSet<String> stopWordsSet;
    /** Median of the loaded IDF values; stands in for the IDF of unknown words. */
    static double idfMedian;

    /**
     * Returns the {@code topN} highest-ranked keywords of {@code content}.
     *
     * @param content text to analyze; {@code null} or empty yields an empty list
     * @param topN    maximum number of keywords to return; values below zero are
     *                treated as zero
     * @return a mutable list of at most {@code topN} keywords, ordered by
     *         {@link Keyword}'s natural ordering as applied by {@link Collections#sort(List)}
     */
    public List<Keyword> analyze(String content, int topN) {
        // Load into a local and publish afterwards, so the static field never
        // refers to a half-filled collection.
        if (stopWordsSet == null) {
            HashSet<String> loadedStopWords = new HashSet<String>();
            this.loadStopWords(loadedStopWords, this.getClass().getResourceAsStream("/stop_words.txt"));
            stopWordsSet = loadedStopWords;
        }
        if (idfMap == null) {
            HashMap<String, Double> loadedIdf = new HashMap<String, Double>();
            this.loadIDFMap(loadedIdf, this.getClass().getResourceAsStream("/idf_dict.txt"));
            idfMap = loadedIdf;
        }

        Map<String, Double> tfMap = this.getTF(content);
        ArrayList<Keyword> keywordList = new ArrayList<Keyword>(tfMap.size());
        for (Map.Entry<String, Double> entry : tfMap.entrySet()) {
            Double knownIdf = idfMap.get(entry.getKey());
            // Words absent from the dictionary fall back to the median IDF.
            double idf = knownIdf != null ? knownIdf.doubleValue() : idfMedian;
            keywordList.add(new Keyword(entry.getKey(), idf * entry.getValue().doubleValue()));
        }
        Collections.sort(keywordList);

        // Clamp so a negative topN yields an empty result instead of an index error,
        // and drop the tail in one operation rather than element by element.
        int limit = Math.max(topN, 0);
        if (keywordList.size() > limit) {
            keywordList.subList(limit, keywordList.size()).clear();
        }
        return keywordList;
    }

    /**
     * Computes the term-frequency weight of every counted word in {@code content}.
     *
     * <p>A segment is counted when it is longer than one character and is not a stop
     * word. The weight of a word is {@code count * 0.1 / totalCountedSegments}.
     *
     * @param content text to segment; {@code null} or empty yields an empty map
     * @return map from word to its term-frequency weight
     */
    private Map<String, Double> getTF(String content) {
        HashMap<String, Double> tfMap = new HashMap<String, Double>();
        if (content == null || content.isEmpty()) {
            return tfMap;
        }
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<String> segments = segmenter.sentenceProcess(content);

        HashMap<String, Integer> freqMap = new HashMap<String, Integer>();
        int wordSum = 0;
        for (String segment : segments) {
            if (segment.length() <= 1 || stopWordsSet.contains(segment)) {
                continue;
            }
            ++wordSum;
            Integer seen = freqMap.get(segment);
            freqMap.put(segment, seen == null ? 1 : seen.intValue() + 1);
        }

        // wordSum is non-zero whenever freqMap has entries, so the division is safe.
        for (Map.Entry<String, Integer> entry : freqMap.entrySet()) {
            tfMap.put(entry.getKey(), (double) entry.getValue().intValue() * 0.1 / (double) wordSum);
        }
        return tfMap;
    }

    /**
     * Reads one stop word per line from {@code in} into {@code set}.
     *
     * <p>Lines are trimmed and blank lines are skipped. The stream is decoded as
     * UTF-8 (assumed to be the encoding of the bundled resource) and is always
     * closed. Loading is best-effort: a missing stream or an I/O error is reported
     * on standard error and whatever was read so far is kept.
     *
     * @param set destination for the stop words
     * @param in  stream to read, or {@code null} if the resource was not found
     */
    private void loadStopWords(Set<String> set, InputStream in) {
        if (in == null) {
            System.err.println("TFIDFAnalyzer: stop word resource not found; no stop words loaded");
            return;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    set.add(word);
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code word idf} pairs from {@code in} into {@code map} and updates
     * {@link #idfMedian} from the values loaded.
     *
     * <p>Each line is trimmed and split on whitespace; the first token is the word
     * and the second its IDF. Lines with fewer than two tokens or a non-numeric IDF
     * are skipped so that a single bad line does not discard the rest of the
     * dictionary. The stream is decoded as UTF-8 (assumed to be the encoding of the
     * bundled resource) and is always closed. Loading is best-effort: a missing
     * stream or an I/O error is reported on standard error.
     *
     * @param map destination for the IDF values
     * @param in  stream to read, or {@code null} if the resource was not found
     */
    private void loadIDFMap(Map<String, Double> map, InputStream in) {
        if (in == null) {
            System.err.println("TFIDFAnalyzer: IDF dictionary resource not found; no IDF values loaded");
            return;
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] kv = line.trim().split("\\s+");
                if (kv.length < 2) {
                    continue;
                }
                try {
                    map.put(kv[0], Double.parseDouble(kv[1]));
                }
                catch (NumberFormatException ignored) {
                    // Malformed IDF value: skip this line and keep loading.
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        // Computed from whatever was loaded; left untouched when nothing was.
        if (!map.isEmpty()) {
            ArrayList<Double> idfList = new ArrayList<Double>(map.values());
            Collections.sort(idfList);
            idfMedian = idfList.get(idfList.size() / 2).doubleValue();
        }
    }

    /**
     * Demo entry point: prints the top five keywords of a sample sentence as
     * {@code name:score,} pairs.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Sample sentence written as Unicode escapes so it survives any source-file
        // encoding; the previous literal was UTF-8 text mis-decoded as GBK.
        String content = "\u5b69\u5b50\u4e0a\u4e86\u5e7c\u513f\u56ed \u5b89\u5168\u9632\u62d0\u6559\u80b2\u8981\u505a\u597d";
        int topN = 5;
        TFIDFAnalyzer tfidfAnalyzer = new TFIDFAnalyzer();
        List<Keyword> list = tfidfAnalyzer.analyze(content, topN);
        for (Keyword word : list) {
            System.out.print(word.getName() + ":" + word.getTfidfvalue() + ",");
        }
    }
}

