package uk.ac.cam.ch.wwmm.oscarMEMM.memm;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscar.terms.TermSets;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;
import uk.ac.cam.ch.wwmm.oscar.types.NamedEntityType;
import uk.ac.cam.ch.wwmm.oscarrecogniser.extractedtrainingdata.ExtractedTrainingData;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscarMEMM/memm/PostProcessor.class */
/**
 * Filters a list of named entities and resolves overlaps between them.
 *
 * <p>The processor works directly on the list handed to the constructor:
 * {@link #filterEntities()} and {@link #removeBlocked()} remove elements from
 * that same list, and {@link #getEntities()} returns it without copying.
 */
final class PostProcessor {
    private static final Logger LOG = LoggerFactory.getLogger(PostProcessor.class);

    /** Surface forms ending in "ic", "al" or "ous"; required of ADJECTIVE entities. */
    private static final Pattern cjPattern = Pattern.compile("\\S+(ic|al|ous)");
    /** Surface forms ending in "ase"/"ases" (either case of the "a"); required of ASE entities. */
    private static final Pattern asePattern = Pattern.compile("\\S+[Aa]ses?");
    /** Suffixes accepted for REACTION entities. */
    private static final Pattern rnPattern = Pattern.compile(".*(tions?|ing|ed|ates?|ativ(e|e?ly)|ises?|izes?|ly[sz](is|e|ing|able)|lytic(a?ly)?|if(y|ies)|ments?|thes(is|es))");
    /** A bracketed "o" or roman-numeral-like token such as "(ii)" or "(IV)", matched case-insensitively. */
    private static final Pattern oxidationStatePattern = Pattern.compile("\\((o|i{1,4}|i{0,3}[xv]|[xv]i{0,4})\\)", Pattern.CASE_INSENSITIVE);

    // The patterns below were previously recompiled on every call through
    // String.matches / String.replaceAll; the regex text is unchanged.
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    private static final Pattern CONTAINS_LETTER = Pattern.compile(".*[a-zA-Z].*");
    private static final Pattern ENDS_WITH_HYPHEN = Pattern.compile(".+[-‐‑‒–—―]");
    private static final Pattern CONTAINS_INNER_SPACE = Pattern.compile(".+ .+");
    private static final Pattern BAD_LEADING_TOKEN = Pattern.compile("(\\.|\\,|:|\\d+(\\.\\d+)?|=|at|is|has|with|are|on|of|and|or|were|in|as|was)\\s+.*");
    private static final Pattern BAD_TRAILING_TOKEN = Pattern.compile(".*\\s+(\\.|\\,|:|=|at|is|has|with|are|on|of|and|or|were|in|as|was)");
    private static final Pattern CONTAINS_WHITESPACE = Pattern.compile(".*\\s.*");

    /** When true, the non-chemical-word check (code 12) is skipped. Never set to true in this class. */
    private static final boolean NO_PC = false;

    private final List<NamedEntity> entities;
    /** Lazily computed by {@link #getBlocked()}; {@code null} until first requested. */
    private Set<NamedEntity> blocked;
    /** Stored by the constructor; not read anywhere in this class. */
    private final TokenSequence tokSeq;
    private final ExtractedTrainingData annotations;

    /**
     * @param tokenSequence the token sequence the entities belong to
     * @param list the entities to post-process; this list is modified in place
     * @param extractedTrainingData source of the non-chemical word list used by the filter
     */
    public PostProcessor(TokenSequence tokenSequence, List<NamedEntity> list, ExtractedTrainingData extractedTrainingData) {
        this.tokSeq = tokenSequence;
        this.entities = list;
        this.annotations = extractedTrainingData;
    }

    /**
     * Applies {@link #filterEntity(String, NamedEntityType)} to the entity's surface and type.
     *
     * @param namedEntity the entity to check
     * @return 0 if the entity is acceptable, otherwise a positive rejection code
     */
    public int filterEntity(NamedEntity namedEntity) {
        return filterEntity(namedEntity.getSurface(), namedEntity.getType());
    }

    /**
     * Checks a surface string of the given type against a fixed sequence of rules
     * and reports the first rule that rejects it.
     *
     * <p>Whitespace runs in {@code str} are collapsed to a single space before any
     * rule is applied. Return codes:
     * <ul>
     *   <li>0 - accepted</li>
     *   <li>1 - contains no ASCII letter and is not a LOCANTPREFIX</li>
     *   <li>2 - ASE that does not match the "-ase(s)" pattern</li>
     *   <li>3 - ADJECTIVE that does not end in "ic", "al" or "ous"</li>
     *   <li>4 - REACTION containing a letter but lacking an accepted suffix</li>
     *   <li>5 - LOCANTPREFIX that does not end in a hyphen or dash</li>
     *   <li>6 - LOCANTPREFIX, ADJECTIVE or ASE containing an inner space</li>
     *   <li>7 - member of the closed-class term set</li>
     *   <li>8 - ends with "," or "."</li>
     *   <li>9 - begins or ends with a punctuation mark or function word from the fixed lists
     *       (the leading list also includes a number)</li>
     *   <li>10 - consists solely of a bracketed oxidation-state-like token</li>
     *   <li>11 - brackets are unbalanced and the surface contains whitespace</li>
     *   <li>12 - type name shorter than four characters and the surface is a known non-chemical word</li>
     *   <li>13 - member of the stop-word term set</li>
     * </ul>
     *
     * @param str the entity surface text
     * @param namedEntityType the entity type
     * @return 0 if accepted, otherwise the code of the first rule that rejected it
     */
    public int filterEntity(String str, NamedEntityType namedEntityType) {
        String surface = WHITESPACE_RUN.matcher(str).replaceAll(" ");
        if (!CONTAINS_LETTER.matcher(surface).matches() && !NamedEntityType.LOCANTPREFIX.isInstance(namedEntityType)) {
            return 1;
        }
        if (NamedEntityType.ASE.isInstance(namedEntityType) && !asePattern.matcher(surface).matches()) {
            return 2;
        }
        if (NamedEntityType.ADJECTIVE.isInstance(namedEntityType) && !cjPattern.matcher(surface).matches()) {
            return 3;
        }
        if (NamedEntityType.REACTION.isInstance(namedEntityType) && !rnPattern.matcher(surface).matches() && CONTAINS_LETTER.matcher(surface).matches()) {
            return 4;
        }
        if (NamedEntityType.LOCANTPREFIX.isInstance(namedEntityType) && !ENDS_WITH_HYPHEN.matcher(surface).matches()) {
            return 5;
        }
        if ((NamedEntityType.LOCANTPREFIX.isInstance(namedEntityType) || NamedEntityType.ADJECTIVE.isInstance(namedEntityType) || NamedEntityType.ASE.isInstance(namedEntityType)) && CONTAINS_INNER_SPACE.matcher(surface).matches()) {
            return 6;
        }
        if (TermSets.getDefaultInstance().getClosedClass().contains(surface)) {
            return 7;
        }
        if (surface.endsWith(",") || surface.endsWith(".")) {
            return 8;
        }
        if (BAD_LEADING_TOKEN.matcher(surface).matches() || BAD_TRAILING_TOKEN.matcher(surface).matches()) {
            return 9;
        }
        if (oxidationStatePattern.matcher(surface).matches()) {
            return 10;
        }
        if (!StringTools.bracketsAreBalanced(surface) && CONTAINS_WHITESPACE.matcher(surface).matches()) {
            return 11;
        }
        // Only types whose name is shorter than four characters are checked
        // against the non-chemical word list.
        if (namedEntityType.getName().length() < 4 && !NO_PC && this.annotations.getNonChemicalWords().contains(surface)) {
            return 12;
        }
        return TermSets.getDefaultInstance().getStopWords().contains(surface) ? 13 : 0;
    }

    /**
     * Removes from the entity list every entity for which
     * {@link #filterEntity(NamedEntity)} returns a positive code.
     */
    public void filterEntities() {
        // Iterate over a snapshot so the live list can be modified safely.
        List<NamedEntity> snapshot = new ArrayList<NamedEntity>(this.entities);
        for (NamedEntity candidate : snapshot) {
            if (filterEntity(candidate) > 0) {
                this.entities.remove(candidate);
            }
        }
    }

    /**
     * Returns the entities that overlap, by token index range, with an entity of
     * higher confidence.
     *
     * <p>Entities are visited in descending order of confidence. An entity whose
     * token index range touches an index already claimed by an earlier entity is
     * added to the result and flagged via {@code setBlocked(true)}; otherwise it
     * claims its whole range. The result is computed once and cached, so later
     * changes to the entity list are not reflected.
     *
     * @return the cached set of blocked entities
     */
    public Set<NamedEntity> getBlocked() {
        if (this.blocked != null) {
            return this.blocked;
        }
        this.blocked = new HashSet<NamedEntity>();
        Set<Integer> claimedIndices = new HashSet<Integer>();
        for (NamedEntity entity : getSorted()) {
            int first = firstTokenIndex(entity);
            int last = lastTokenIndex(entity);
            if (overlaps(claimedIndices, first, last)) {
                this.blocked.add(entity);
                entity.setBlocked(true);
            } else {
                for (int index = first; index <= last; index++) {
                    claimedIndices.add(Integer.valueOf(index));
                }
            }
        }
        return this.blocked;
    }

    /** Removes every entity reported by {@link #getBlocked()} from the entity list. */
    public void removeBlocked() {
        for (NamedEntity entity : getBlocked()) {
            LOG.debug("Removing: {}", entity);
            this.entities.remove(entity);
        }
    }

    /** Returns the live entity list (not a copy). */
    public List<NamedEntity> getEntities() {
        return this.entities;
    }

    /** Returns a copy of the entity list sorted by confidence, highest first. */
    private List<NamedEntity> getSorted() {
        List<NamedEntity> sorted = new ArrayList<NamedEntity>(this.entities);
        Collections.sort(sorted, Collections.reverseOrder(new Comparator<NamedEntity>() {
            @Override
            public int compare(NamedEntity namedEntity, NamedEntity namedEntity2) {
                return Double.compare(namedEntity.getConfidence(), namedEntity2.getConfidence());
            }
        }));
        return sorted;
    }

    /** Returns the index of the entity's first token. */
    private static int firstTokenIndex(NamedEntity entity) {
        return entity.getTokens().get(0).getIndex();
    }

    /** Returns the index of the entity's last token. */
    private static int lastTokenIndex(NamedEntity entity) {
        return entity.getTokens().get(entity.getTokens().size() - 1).getIndex();
    }

    /** Reports whether any index in the inclusive range {@code first..last} is already claimed. */
    private static boolean overlaps(Set<Integer> claimedIndices, int first, int last) {
        for (int index = first; index <= last; index++) {
            if (claimedIndices.contains(Integer.valueOf(index))) {
                return true;
            }
        }
        return false;
    }
}
