package org.apache.tika.eval.core.metadata;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.eval.core.langid.LanguageIDWrapper;
import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
import org.apache.tika.eval.core.textstats.CommonTokens;
import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
import org.apache.tika.eval.core.tokens.CommonTokenResult;
import org.apache.tika.eval.core.tokens.TokenCounts;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;

/* loaded from: input_file:org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.class */
/**
 * Metadata filter that reads the extracted text stored under
 * {@link TikaCoreProperties#TIKA_CONTENT}, runs the shared text-statistics
 * calculator over it and writes the results back into the same
 * {@link Metadata} object under keys prefixed with {@link #TIKA_EVAL_NS}.
 *
 * <p>When the content is missing or blank the metadata is left untouched.
 */
public class TikaEvalMetadataFilter extends MetadataFilter {
    /** Prefix shared by every metadata key this filter writes. */
    public static String TIKA_EVAL_NS = "tika-eval:";

    /** Total token count, as reported by the token-count calculator. */
    public static Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numTokens");

    /** Unique token count, as reported by the token-count calculator. */
    public static Property NUM_UNIQUE_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numUniqueTokens");

    /** Alphabetic token count, as reported by the common-tokens calculator. */
    public static Property NUM_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens");

    /** Common token count, as reported by the common-tokens calculator. */
    public static Property NUM_COMMON_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numCommonTokens");

    /** Unique alphabetic token count, as reported by the common-tokens calculator. */
    public static Property NUM_UNIQUE_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens");

    /** Language of the first entry in the language-identification result list. */
    public static Property LANGUAGE = Property.externalText(TIKA_EVAL_NS + "lang");

    /** Raw score of the first entry in the language-identification result list. */
    public static Property LANGUAGE_CONFIDENCE = Property.externalReal(TIKA_EVAL_NS + "langConfidence");

    /** Out-of-vocabulary value; set to {@code -1.0} when there are no alphabetic tokens. */
    public static Property OUT_OF_VOCABULARY = Property.externalReal(TIKA_EVAL_NS + "oov");

    /** Calculator shared by all instances; assigned once in the static initializer below. */
    static CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;

    static {
        // The element type is left raw, as in the original: the element type the
        // constructor expects is not among this file's imports.
        ArrayList calculators = new ArrayList();
        calculators.add(new BasicTokenCountStatsCalculator());
        calculators.add(new CommonTokens());
        TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calculators);
    }

    /**
     * Adds text statistics to {@code metadata} when it carries non-blank content.
     *
     * @param metadata the metadata to read the content from and to write the statistics to
     * @throws TikaException declared by the overridden method; nothing in this body throws it directly
     */
    @Override // org.apache.tika.metadata.filter.MetadataFilter
    public void filter(Metadata metadata) throws TikaException {
        String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
        if (!StringUtils.isAllBlank(content)) {
            calcStats(content, metadata);
        }
    }

    /**
     * Runs the shared calculator over {@code str} and stores token counts,
     * common-token statistics and language information in {@code metadata},
     * in that order.
     */
    private void calcStats(String str, Metadata metadata) {
        Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(str);
        TokenCounts counts = (TokenCounts) results.get(BasicTokenCountStatsCalculator.class);
        writeTokenCounts(counts, metadata);
        writeCommonTokenStats((CommonTokenResult) results.get(CommonTokens.class), metadata);
        writeLanguage(results, metadata);
    }

    /** Stores the total and unique token counts. */
    private void writeTokenCounts(TokenCounts counts, Metadata metadata) {
        metadata.set(NUM_TOKENS, counts.getTotalTokens());
        metadata.set(NUM_UNIQUE_TOKENS, counts.getTotalUniqueTokens());
    }

    /** Stores the alphabetic/common token counts and the out-of-vocabulary value. */
    private void writeCommonTokenStats(CommonTokenResult commonTokens, Metadata metadata) {
        metadata.set(NUM_ALPHA_TOKENS, commonTokens.getAlphabeticTokens());
        metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokens.getUniqueAlphabeticTokens());
        metadata.set(NUM_COMMON_TOKENS, commonTokens.getCommonTokens());
        if (commonTokens.getAlphabeticTokens() <= 0) {
            // No alphabetic tokens: store the -1.0 sentinel instead of asking for an OOV value.
            metadata.set(OUT_OF_VOCABULARY, -1.0d);
        } else {
            metadata.set(OUT_OF_VOCABULARY, commonTokens.getOOV());
        }
    }

    /** Stores the language and raw score of the first language result, if there is one. */
    private void writeLanguage(Map<Class, Object> results, Metadata metadata) {
        // NOTE(review): none of the calculators added in the static initializer is a
        // LanguageIDWrapper; presumably the composite calculator supplies this entry
        // itself. A missing entry would fail with a NullPointerException below - confirm.
        @SuppressWarnings("unchecked")
        List<LanguageResult> languages = (List<LanguageResult>) results.get(LanguageIDWrapper.class);
        if (languages.isEmpty()) {
            return;
        }
        LanguageResult first = languages.get(0);
        metadata.set(LANGUAGE, first.getLanguage());
        metadata.set(LANGUAGE_CONFIDENCE, first.getRawScore());
    }
}
