package uk.ac.cam.ch.wwmm.oscarrecogniser.finder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.collections.set.UnmodifiableSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.cam.ch.wwmm.oscar.chemnamedict.core.ChemNameDictRegistry;
import uk.ac.cam.ch.wwmm.oscar.document.NamedEntity;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscar.ont.OntologyTerms;
import uk.ac.cam.ch.wwmm.oscar.terms.TermSets;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;
import uk.ac.cam.ch.wwmm.oscar.types.NamedEntityType;
import uk.ac.cam.ch.wwmm.oscarrecogniser.tokenanalysis.NGram;
import uk.ac.cam.ch.wwmm.oscarrecogniser.tokenanalysis.PrefixFinder;
import uk.ac.cam.ch.wwmm.oscarrecogniser.tokenanalysis.TokenSuffixClassifier;
import uk.ac.cam.ch.wwmm.oscartokeniser.TokenClassifier;
import uk.ac.cam.ch.wwmm.oscartokeniser.Tokeniser;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscarrecogniser/finder/DFANEFinder.class */
public class DFANEFinder extends DFAFinder {
    private final Logger logger = LoggerFactory.getLogger(DFANEFinder.class);
    private TokenClassifier tokenClassifier;
    private Map<String, NamedEntityType> neTerms;
    private UnmodifiableSet registryNames;
    private static final long serialVersionUID = -3307600610608772402L;
    private static DFANEFinder defaultInstance;
    public static Pattern P_TWO_ADJACENT_LOWERCASE = Pattern.compile("[a-z][a-z]");
    public static Pattern P_UPPERCASE_LETTER = Pattern.compile("[A-Z]");
    private static final String REP_CM_NON_WORD = "$CMNONWORD";
    private static final String REP_ONT_WORD = "$ONTWORD";
    private static final String REP_IN_CND = "$INCND";
    private static final String REP_POLY_BRACKET_ = "$polybracket-";
    private static final String REP_POLY_ = "$poly-";
    private static final String REP_OPEN_BRACKET = "$-(-";
    private static final String REP_ENDS_IN_ELEMENT = "$ENDSINEM";
    private static final String REP_ELEMENT = "$EM";
    private static final String REP_PREFIX_BODY = "$PREFIXBODY";
    private static final String REP_HYPH = "$HYPH";
    private static final String REP_DOTS = "$DOTS";
    private static final String REP_STOP = "$STOP";
    private static final String REP_CPR_FORMULA = "$CPR_FORMULA";

    public static synchronized DFANEFinder getDefaultInstance() {
        if (defaultInstance == null) {
            defaultInstance = new DFANEFinder(TermMaps.getInstance().getNeTerms(), TokenClassifier.getDefaultInstance(), OntologyTerms.getDefaultInstance(), (UnmodifiableSet) UnmodifiableSet.decorate(ChemNameDictRegistry.getDefaultInstance().getAllNames()));
        }
        return defaultInstance;
    }

    @Deprecated
    public static void reinitialise() {
        defaultInstance = null;
        getDefaultInstance();
    }

    @Deprecated
    public static void destroyInstance() {
        defaultInstance = null;
    }

    public static void destroyInstanceIfWordTokenises(String str) {
        if (defaultInstance != null && Tokeniser.getDefaultInstance().tokenise(str).getTokens().size() > 1) {
            defaultInstance = null;
        }
    }

    public DFANEFinder(Map<String, NamedEntityType> map, TokenClassifier tokenClassifier, OntologyTerms ontologyTerms, UnmodifiableSet unmodifiableSet) {
        this.logger.debug("Initialising DFA NE Finder...");
        this.neTerms = new HashMap(map);
        this.tokenClassifier = tokenClassifier;
        this.ontologyTerms = ontologyTerms;
        this.registryNames = unmodifiableSet;
        super.init();
        this.logger.debug("Initialised DFA NE Finder");
    }

    @Override // uk.ac.cam.ch.wwmm.oscarrecogniser.finder.DFAFinder
    protected void loadTerms() {
        TermMaps termMaps = TermMaps.getInstance();
        this.logger.debug("Adding terms to DFA finder...");
        for (String str : this.neTerms.keySet()) {
            addNamedEntity(str, this.neTerms.get(str), true);
        }
        this.logger.debug("Adding ontology terms to DFA finder...");
        Iterator<String> it = this.ontologyTerms.getAllTerms().iterator();
        while (it.hasNext()) {
            addNamedEntity(it.next(), NamedEntityType.ONTOLOGY, false);
        }
        this.logger.debug("Adding custom NEs ...");
        Iterator<String> it2 = termMaps.getCustEnt().keySet().iterator();
        while (it2.hasNext()) {
            addNamedEntity(it2.next(), NamedEntityType.CUSTOM, true);
        }
        this.logger.debug("Adding names from ChemNameDict to DFA finder...");
        Iterator it3 = this.registryNames.iterator();
        while (it3.hasNext()) {
            Object next = it3.next();
            if (next instanceof String) {
                addNamedEntity((String) next, NamedEntityType.COMPOUND, false);
            }
        }
    }

    public List<NamedEntity> findNamedEntities(TokenSequence tokenSequence, NGram nGram, double d) {
        NECollector nECollector = new NECollector();
        findItems(tokenSequence, generateTokenRepresentations(tokenSequence, nGram, d), nECollector);
        return nECollector.getNes();
    }

    List<RepresentationList> generateTokenRepresentations(TokenSequence tokenSequence, NGram nGram, double d) {
        ArrayList arrayList = new ArrayList();
        Iterator<Token> it = tokenSequence.getTokens().iterator();
        while (it.hasNext()) {
            arrayList.add(generateTokenRepresentations(it.next(), nGram, d));
        }
        return arrayList;
    }

    protected RepresentationList generateTokenRepresentations(Token token, NGram nGram, double d) {
        String str;
        RepresentationList representationList = new RepresentationList();
        String surface = token.getSurface();
        representationList.addRepresentation(surface);
        String normaliseName = StringTools.normaliseName(surface);
        if (!normaliseName.equals(surface)) {
            representationList.addRepresentation(normaliseName);
        }
        representationList.addRepresentations(getSubReRepsForToken(surface));
        if (surface.length() == 1) {
            if (StringTools.isHyphen(surface)) {
                representationList.addRepresentation(REP_HYPH);
            } else if (StringTools.isMidElipsis(surface)) {
                representationList.addRepresentation(REP_DOTS);
            }
        }
        for (NamedEntityType namedEntityType : this.tokenClassifier.classifyToken(surface)) {
            if (!NamedEntityType.PROPERNOUN.equals(namedEntityType) || !surface.matches("[A-Z][a-z]+") || !TermSets.getDefaultInstance().getUsrDictWords().contains(surface.toLowerCase()) || TermSets.getDefaultInstance().getUsrDictWords().contains(surface)) {
                representationList.addRepresentation("$" + namedEntityType.getName());
            }
        }
        boolean z = false;
        Matcher matcher = PrefixFinder.prefixPattern.matcher(surface);
        if (surface.length() >= 2 && matcher.matches()) {
            String group = matcher.group(matcher.groupCount());
            String normaliseName2 = StringTools.normaliseName(group);
            if (group == null || group.equals("")) {
                representationList.addRepresentation("$" + NamedEntityType.LOCANTPREFIX.getName());
            } else {
                if (isChemicalFormula(group)) {
                    representationList.addRepresentation(REP_CPR_FORMULA);
                }
                if ((TermSets.getDefaultInstance().getStopWords().contains(normaliseName2) || TermSets.getDefaultInstance().getClosedClass().contains(normaliseName2)) && !isElement(normaliseName2)) {
                    z = true;
                }
            }
        }
        if (isPrefixBody(surface)) {
            representationList.addRepresentation(REP_PREFIX_BODY);
        }
        if (isElement(normaliseName)) {
            representationList.addRepresentation(REP_ELEMENT);
        }
        if (isEndingWithElementName(surface)) {
            representationList.addRepresentation(REP_ENDS_IN_ELEMENT);
        }
        if (!z && surface.length() > 3 && surface.matches(".*[a-z][a-z].*")) {
            if ((this.registryNames.contains(surface) ? 100.0d : (TermSets.getDefaultInstance().getUsrDictWords().contains(normaliseName) || TermSets.getDefaultInstance().getUsrDictWords().contains(surface)) ? -100.0d : nGram.testWord(surface)) > d) {
                representationList.addRepresentation("$" + TokenSuffixClassifier.classifyBySuffix(surface).getName());
                if (surface.startsWith(HelpFormatter.DEFAULT_OPT_PREFIX)) {
                    representationList.addRepresentation("$-" + TokenSuffixClassifier.classifyBySuffix(surface).getName());
                }
                if (surface.endsWith(HelpFormatter.DEFAULT_OPT_PREFIX)) {
                    representationList.addRepresentation("$" + TokenSuffixClassifier.classifyBySuffix(surface).getName() + HelpFormatter.DEFAULT_OPT_PREFIX);
                }
                String str2 = surface;
                while (true) {
                    str = str2;
                    if (!str.endsWith(")") && !str.endsWith("]")) {
                        break;
                    }
                    str2 = str.substring(0, str.length() - 1);
                }
                TermMaps termMaps = TermMaps.getInstance();
                for (int i = 1; i < str.length(); i++) {
                    if (termMaps.getSuffixes().contains(str.substring(i))) {
                        representationList.addRepresentation("$-" + str.substring(i));
                    }
                }
                if (surface.contains("(") && !surface.contains(")")) {
                    representationList.addRepresentation(REP_OPEN_BRACKET);
                }
                if (surface.matches("[Pp]oly.+")) {
                    representationList.addRepresentation(REP_POLY_);
                }
                if (surface.matches("[Pp]oly[\\(\\[\\{].+")) {
                    representationList.addRepresentation(REP_POLY_BRACKET_);
                }
            }
        }
        if (this.registryNames.contains(surface)) {
            representationList.addRepresentation(REP_IN_CND);
        }
        if (this.ontologyTerms.containsTerm(normaliseName)) {
            representationList.addRepresentation(REP_ONT_WORD);
        }
        if ((TermSets.getDefaultInstance().getStopWords().contains(normaliseName) || TermSets.getDefaultInstance().getClosedClass().contains(normaliseName)) && !isElement(normaliseName)) {
            representationList.addRepresentation(REP_STOP);
        }
        return representationList;
    }

    private boolean isChemicalFormula(String str) {
        return this.tokenClassifier.isTokenLevelRegexMatch(str, "formulaRegex");
    }

    private boolean hasCapitalLetter(String str) {
        return P_UPPERCASE_LETTER.matcher(str).find();
    }

    private boolean hasTwoAdjacentLowerCaseLetters(String str) {
        return P_TWO_ADJACENT_LOWERCASE.matcher(str).find();
    }

    private boolean isEndingWithElementName(String str) {
        return TermSets.getDefaultInstance().getEndingInElementNamePattern().matcher(str).matches();
    }

    private boolean isElement(String str) {
        return TermSets.getDefaultInstance().getElements().contains(str);
    }

    private boolean isPrefixBody(String str) {
        return PrefixFinder.prefixBody.matcher(str).matches();
    }
}
