package com.kennycason.kumo.nlp;

import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.nlp.filter.CompositeFilter;
import com.kennycason.kumo.nlp.filter.Filter;
import com.kennycason.kumo.nlp.filter.StopWordFilter;
import com.kennycason.kumo.nlp.filter.WordSizeFilter;
import com.kennycason.kumo.nlp.normalize.CharacterStrippingNormalizer;
import com.kennycason.kumo.nlp.normalize.LowerCaseNormalizer;
import com.kennycason.kumo.nlp.normalize.Normalizer;
import com.kennycason.kumo.nlp.normalize.TrimToEmptyNormalizer;
import com.kennycason.kumo.nlp.tokenizer.WhiteSpaceWordTokenizer;
import com.kennycason.kumo.nlp.tokenizer.WordTokenizer;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;

/* loaded from: input_file:com/kennycason/kumo/nlp/FrequencyAnalyzer.class */
/**
 * Turns raw text into a list of {@link WordFrequency} entries.
 *
 * <p>Each input line is tokenized with the configured {@link WordTokenizer}; the resulting
 * tokens are filtered (stop words, word length bounds, then any user-supplied {@link Filter}s),
 * the surviving tokens are passed through the configured {@link Normalizer}s in order, and the
 * normalized tokens are counted. Only the first {@code wordFrequenciesToReturn} entries, ordered
 * by {@link WordFrequency#compareTo}, are returned.
 *
 * <p>Note that filtering is applied to the raw tokens, i.e. <em>before</em> normalization.
 */
public class FrequencyAnalyzer {
    public static final String DEFAULT_ENCODING = "UTF-8";
    public static final int DEFAULT_WORD_MAX_LENGTH = 32;
    public static final int DEFAULT_WORD_MIN_LENGTH = 3;
    public static final int DEFAULT_WORD_FREQUENCIES_TO_RETURN = 50;
    public static final long DEFAULT_URL_LOAD_TIMEOUT = 3000;

    private final Set<String> stopWords = new HashSet<>();
    private WordTokenizer wordTokenizer = new WhiteSpaceWordTokenizer();
    private final List<Filter> filters = new ArrayList<>();
    private final List<Normalizer> normalizers = new ArrayList<>();
    private int wordFrequenciesToReturn = DEFAULT_WORD_FREQUENCIES_TO_RETURN;
    private int maxWordLength = DEFAULT_WORD_MAX_LENGTH;
    private int minWordLength = DEFAULT_WORD_MIN_LENGTH;
    private String characterEncoding = DEFAULT_ENCODING;
    private long urlLoadTimeout = DEFAULT_URL_LOAD_TIMEOUT;

    /**
     * Creates an analyzer with the default normalizer chain, applied in this order:
     * {@link TrimToEmptyNormalizer}, {@link CharacterStrippingNormalizer},
     * {@link LowerCaseNormalizer}.
     */
    public FrequencyAnalyzer() {
        this.normalizers.add(new TrimToEmptyNormalizer());
        this.normalizers.add(new CharacterStrippingNormalizer());
        this.normalizers.add(new LowerCaseNormalizer());
    }

    /**
     * Reads all lines from the stream using the configured character encoding and analyzes them.
     * The stream is not closed by this method; that remains the caller's responsibility.
     *
     * @param inputStream source of the text
     * @return the top word frequencies
     * @throws IOException if reading the stream fails
     */
    public List<WordFrequency> load(InputStream inputStream) throws IOException {
        return load(IOUtils.readLines(inputStream, this.characterEncoding));
    }

    /**
     * Reads and analyzes the content of a file.
     *
     * @param file file to read
     * @return the top word frequencies
     * @throws IOException if the file cannot be opened or read
     */
    public List<WordFrequency> load(File file) throws IOException {
        // The stream is opened here, so it must be closed here; load(InputStream) reads
        // everything eagerly, which makes closing right after the call safe.
        try (InputStream inputStream = new FileInputStream(file)) {
            return load(inputStream);
        }
    }

    /**
     * Reads and analyzes the content of the file at the given path.
     *
     * @param str path of the file to read (this is a file path, not the text to analyze)
     * @return the top word frequencies
     * @throws IOException if the file cannot be opened or read
     */
    public List<WordFrequency> load(String str) throws IOException {
        return load(new File(str));
    }

    /**
     * Fetches the document at the URL, extracts the text of its body and analyzes it as a
     * single line of input.
     *
     * @param url location of the document
     * @return the top word frequencies
     * @throws IOException if fetching or parsing the document fails
     */
    public List<WordFrequency> load(URL url) throws IOException {
        final String bodyText = Jsoup.parse(url, timeoutAsInt(this.urlLoadTimeout)).body().text();
        return load(Collections.singletonList(bodyText));
    }

    /**
     * Analyzes the given lines of text.
     *
     * @param list lines of text to tokenize, filter, normalize and count
     * @return the top word frequencies
     */
    public List<WordFrequency> load(List<String> list) {
        final List<WordFrequency> wordFrequencies = new ArrayList<>();
        for (Map.Entry<String, Integer> entry : buildWordFrequencies(list, this.wordTokenizer).entrySet()) {
            wordFrequencies.add(new WordFrequency(entry.getKey(), entry.getValue()));
        }
        return takeTopFrequencies(wordFrequencies);
    }

    /**
     * Sorts and truncates pre-computed word frequencies; no tokenizing, filtering or
     * normalizing is applied.
     *
     * @param list pre-computed word frequencies
     * @return the top word frequencies
     */
    public List<WordFrequency> loadWordFrequencies(List<WordFrequency> list) {
        return takeTopFrequencies(list);
    }

    /** Counts occurrences of every normalized token that passes the filters. */
    private Map<String, Integer> buildWordFrequencies(List<String> texts, WordTokenizer tokenizer) {
        final Map<String, Integer> wordFrequencies = new HashMap<>();
        // The filter only depends on analyzer configuration, so build it once per call
        // instead of once per input line.
        final CompositeFilter compositeFilter = buildCompositeFilter();
        for (String text : texts) {
            for (String word : filter(tokenizer.tokenize(text), compositeFilter)) {
                wordFrequencies.merge(normalize(word), 1, Integer::sum);
            }
        }
        return wordFrequencies;
    }

    /** Combines the stop word filter, the word size filter and all user-supplied filters. */
    private CompositeFilter buildCompositeFilter() {
        final List<Filter> allFilters = new ArrayList<>();
        allFilters.add(new StopWordFilter(this.stopWords));
        allFilters.add(new WordSizeFilter(this.minWordLength, this.maxWordLength));
        allFilters.addAll(this.filters);
        return new CompositeFilter(allFilters);
    }

    /** Returns the words accepted by the given filter, preserving their order. */
    private List<String> filter(List<String> words, CompositeFilter compositeFilter) {
        return words.stream().filter(compositeFilter).collect(Collectors.toList());
    }

    /** Applies every configured normalizer to the word, in registration order. */
    private String normalize(String str) {
        String normalized = str;
        for (Normalizer normalizer : this.normalizers) {
            normalized = normalizer.normalize(normalized);
        }
        return normalized;
    }

    /**
     * Sorts the frequencies with {@link WordFrequency#compareTo} and keeps at most
     * {@code wordFrequenciesToReturn} of them. A negative limit is treated as zero.
     */
    private List<WordFrequency> takeTopFrequencies(Collection<WordFrequency> wordFrequencies) {
        if (wordFrequencies.isEmpty()) {
            return Collections.emptyList();
        }
        // limit() yields an independent list rather than a subList view that would keep
        // the complete sorted list reachable.
        return wordFrequencies.stream()
                .sorted(WordFrequency::compareTo)
                .limit(Math.max(0, this.wordFrequenciesToReturn))
                .collect(Collectors.toList());
    }

    /** Narrows a millisecond timeout to {@code int}, clamping instead of wrapping around. */
    private static int timeoutAsInt(long timeoutMillis) {
        return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, timeoutMillis));
    }

    /**
     * Replaces the current stop words with the given ones.
     *
     * @param collection new stop words; must not be {@code null}
     */
    public void setStopWords(Collection<String> collection) {
        this.stopWords.clear();
        this.stopWords.addAll(collection);
    }

    /**
     * @param i maximum number of word frequencies returned by the load methods
     */
    public void setWordFrequenciesToReturn(int i) {
        this.wordFrequenciesToReturn = i;
    }

    /**
     * @param i minimum word length passed to the {@link WordSizeFilter}
     */
    public void setMinWordLength(int i) {
        this.minWordLength = i;
    }

    /**
     * @param i maximum word length passed to the {@link WordSizeFilter}
     */
    public void setMaxWordLength(int i) {
        this.maxWordLength = i;
    }

    /**
     * @param wordTokenizer tokenizer used to split each line of text into words
     */
    public void setWordTokenizer(WordTokenizer wordTokenizer) {
        this.wordTokenizer = wordTokenizer;
    }

    /** Removes all user-supplied filters; the stop word and word size filters still apply. */
    public void clearFilters() {
        this.filters.clear();
    }

    /**
     * @param filter additional filter applied after the stop word and word size filters
     */
    public void addFilter(Filter filter) {
        this.filters.add(filter);
    }

    /**
     * Replaces all user-supplied filters with the given one.
     *
     * @param filter the only user-supplied filter to apply from now on
     */
    public void setFilter(Filter filter) {
        this.filters.clear();
        this.filters.add(filter);
    }

    /** Removes all normalizers, including the defaults registered by the constructor. */
    public void clearNormalizers() {
        this.normalizers.clear();
    }

    /**
     * @param normalizer normalizer appended to the end of the normalizer chain
     */
    public void addNormalizer(Normalizer normalizer) {
        this.normalizers.add(normalizer);
    }

    /**
     * Replaces all normalizers, including the defaults, with the given one.
     *
     * @param normalizer the only normalizer to apply from now on
     */
    public void setNormalizer(Normalizer normalizer) {
        this.normalizers.clear();
        this.normalizers.add(normalizer);
    }

    /**
     * @param str name of the character encoding used when reading input streams and files
     */
    public void setCharacterEncoding(String str) {
        this.characterEncoding = str;
    }

    /**
     * @param j timeout in milliseconds for loading a URL; values above
     *          {@link Integer#MAX_VALUE} are clamped to it
     */
    public void setUrlLoadTimeout(long j) {
        this.urlLoadTimeout = j;
    }
}
