/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.parser;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Set;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import org.apache.commons.io.IOUtils;

public class htmlParser
extends AbstractParser
implements Parser {
    /**
     * Default upper bound on the number of links. The {@code parse(...)} overloads of this
     * class pass the same value (as the literal {@code 10000}) for their {@code maxLinks} argument.
     */
    private static final int DEFAULT_MAX_LINKS = 10000;

    public htmlParser() {
        super("Streaming HTML Parser");
        this.SUPPORTED_EXTENSIONS.add("htm");
        this.SUPPORTED_EXTENSIONS.add("html");
        this.SUPPORTED_EXTENSIONS.add("shtml");
        this.SUPPORTED_EXTENSIONS.add("shtm");
        this.SUPPORTED_EXTENSIONS.add("stm");
        this.SUPPORTED_EXTENSIONS.add("xhtml");
        this.SUPPORTED_EXTENSIONS.add("phtml");
        this.SUPPORTED_EXTENSIONS.add("phtm");
        this.SUPPORTED_EXTENSIONS.add("tpl");
        this.SUPPORTED_EXTENSIONS.add("php");
        this.SUPPORTED_EXTENSIONS.add("php2");
        this.SUPPORTED_EXTENSIONS.add("php3");
        this.SUPPORTED_EXTENSIONS.add("php4");
        this.SUPPORTED_EXTENSIONS.add("php5");
        this.SUPPORTED_EXTENSIONS.add("cfm");
        this.SUPPORTED_EXTENSIONS.add("asp");
        this.SUPPORTED_EXTENSIONS.add("aspx");
        this.SUPPORTED_EXTENSIONS.add("tex");
        this.SUPPORTED_EXTENSIONS.add("txt");
        this.SUPPORTED_EXTENSIONS.add("msg");
        this.SUPPORTED_MIME_TYPES.add("text/html");
        this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
        this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
        this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
        this.SUPPORTED_MIME_TYPES.add("application/x-tex");
        this.SUPPORTED_MIME_TYPES.add("application/vnd.ms-outlook");
        this.SUPPORTED_MIME_TYPES.add("text/plain");
        this.SUPPORTED_MIME_TYPES.add("text/csv");
    }

    /**
     * Parses the given stream using {@link TagValency#EVAL} as default valency and an empty
     * set of valency switch tag names.
     * <p>
     * Delegates to the limit-aware implementation with an unbounded anchor count, an unbounded
     * byte count and a link limit of 10000.
     *
     * @param location        URL of the resource being parsed
     * @param mimeType        MIME type recorded on the resulting document
     * @param documentCharset charset name announced for the resource, may be {@code null}
     * @param vocscraper      vocabulary scraper handed to the content scraper
     * @param timezoneOffset  time zone offset handed to the content scraper
     * @param sourceStream    stream delivering the resource content
     * @return the parsed document, optionally followed by a snapshot document
     * @throws Parser.Failure when the content cannot be parsed
     */
    @Override
    public Document[] parse(DigestURL location, String mimeType, String documentCharset, VocabularyScraper vocscraper, int timezoneOffset, InputStream sourceStream) throws Parser.Failure, InterruptedException {
        final Set<String> noSwitchTagNames = new HashSet<String>();
        final int anchorLimit = Integer.MAX_VALUE;
        final int linkLimit = 10000; // same value as DEFAULT_MAX_LINKS
        final long byteLimit = Long.MAX_VALUE;
        return this.parseWithLimits(location, mimeType, documentCharset, TagValency.EVAL, noSwitchTagNames,
                vocscraper, timezoneOffset, sourceStream, anchorLimit, linkLimit, byteLimit);
    }

    /**
     * Parses the given stream with a caller-supplied default valency and set of valency
     * switch tag names.
     * <p>
     * Delegates to the limit-aware implementation with an unbounded anchor count, an unbounded
     * byte count and a link limit of 10000.
     *
     * @param location              URL of the resource being parsed
     * @param mimeType              MIME type recorded on the resulting document
     * @param documentCharset       charset name announced for the resource, may be {@code null}
     * @param defaultValency        default tag valency forwarded to the scraper
     * @param valencySwitchTagNames tag names forwarded to the scraper as valency switches
     * @param vocscraper            vocabulary scraper handed to the content scraper
     * @param timezoneOffset        time zone offset handed to the content scraper
     * @param sourceStream          stream delivering the resource content
     * @return the parsed document, optionally followed by a snapshot document
     * @throws Parser.Failure when the content cannot be parsed
     */
    @Override
    public Document[] parse(DigestURL location, String mimeType, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocscraper, int timezoneOffset, InputStream sourceStream) throws Parser.Failure, InterruptedException {
        final int anchorLimit = Integer.MAX_VALUE;
        final int linkLimit = 10000; // same value as DEFAULT_MAX_LINKS
        final long byteLimit = Long.MAX_VALUE;
        return this.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames,
                vocscraper, timezoneOffset, sourceStream, anchorLimit, linkLimit, byteLimit);
    }

    /**
     * Reports that this parser implements {@code parseWithLimits(...)}.
     *
     * @return always {@code true}
     */
    @Override
    public boolean isParseWithLimitsSupported() {
        return true;
    }

    /**
     * Parses the given stream while enforcing a link limit and a byte limit.
     * <p>
     * The single {@code maxLinks} value is applied to both the anchor limit and the link limit
     * of the underlying implementation.
     *
     * @param location              URL of the resource being parsed
     * @param mimeType              MIME type recorded on the resulting document
     * @param documentCharset       charset name announced for the resource, may be {@code null}
     * @param defaultValency        default tag valency forwarded to the scraper
     * @param valencySwitchTagNames tag names forwarded to the scraper as valency switches
     * @param vocscraper            vocabulary scraper handed to the content scraper
     * @param timezoneOffset        time zone offset handed to the content scraper
     * @param sourceStream          stream delivering the resource content
     * @param maxLinks              limit used for both anchors and links
     * @param maxBytes              maximum number of bytes to process
     * @return the parsed document, optionally followed by a snapshot document
     * @throws Parser.Failure when the content cannot be parsed
     */
    @Override
    public Document[] parseWithLimits(DigestURL location, String mimeType, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocscraper, int timezoneOffset, InputStream sourceStream, int maxLinks, long maxBytes) throws Parser.Failure {
        // The public contract exposes one limit only; anchors share it with links.
        final int maxAnchors = maxLinks;
        return this.parseWithLimits(location, mimeType, documentCharset, defaultValency, valencySwitchTagNames,
                vocscraper, timezoneOffset, sourceStream, maxAnchors, maxLinks, maxBytes);
    }

    /**
     * Parses the stream into a document and, when the page advertises a "hashbang" snapshot,
     * additionally fetches and parses that snapshot.
     * <p>
     * A snapshot is attempted when the URL reference starts with {@code "!"} or when the scraped
     * metas contain the key {@code "fragment"} with value {@code "!"}. Any exception raised
     * while deciding on or producing the snapshot is swallowed and the snapshot is skipped.
     *
     * @return an array holding the main document and, if available, the snapshot document
     * @throws Parser.Failure when parsing fails, including any {@link IOException} raised on the way
     */
    private Document[] parseWithLimits(DigestURL location, String mimeType, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocscraper, int timezoneOffset, InputStream sourceStream, int maxAnchors, int maxLinks, long maxBytes) throws Parser.Failure {
        try {
            // One-element holder that parseToScraper fills with the charset it settled on.
            final Charset[] charsetHolder = new Charset[]{null};
            final ContentScraper scraper = htmlParser.parseToScraper(location, documentCharset, defaultValency,
                    valencySwitchTagNames, vocscraper, charsetHolder, timezoneOffset, sourceStream,
                    maxAnchors, maxLinks, maxBytes);
            final Document mainDocument = this.transformScraper(location, mimeType, charsetHolder[0].name(), scraper);

            Document snapshot = null;
            try {
                final boolean hashbangInUrl = location.getRef() != null && location.getRef().startsWith("!");
                if (hashbangInUrl
                        || (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!"))) {
                    snapshot = this.parseAlternativeSnapshot(location, mimeType, documentCharset, defaultValency,
                            valencySwitchTagNames, vocscraper, timezoneOffset, maxAnchors, maxLinks, maxBytes);
                }
            } catch (Exception ignored) {
                // The snapshot is optional: on any failure only the main document is returned.
                snapshot = null;
            }

            if (snapshot == null) {
                return new Document[]{mainDocument};
            }
            return new Document[]{mainDocument, snapshot};
        } catch (IOException e) {
            throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
        }
    }

    /**
     * Builds a {@link Document} from the data collected by a scraper.
     * <p>
     * Headlines of levels 1 to 6 are concatenated, level by level, into the section list.
     * Images are collected into a map keyed by their URL, so a later image with the same URL
     * replaces an earlier one.
     *
     * @param location URL the document belongs to
     * @param mimeType MIME type recorded on the document
     * @param charSet  charset name recorded on the document
     * @param scraper  scraper holding the extracted content
     * @return the assembled document, carrying the scraper, its icons, its linked data types
     *         and its "limits exceeded" flag
     */
    private Document transformScraper(DigestURL location, String mimeType, String charSet, ContentScraper scraper) {
        // Collect the headlines per level, then flatten them in level order.
        final String[][] headlinesByLevel = new String[6][];
        int headlineCount = 0;
        for (int level = 1; level <= 6; level++) {
            final String[] headlines = scraper.getHeadlines(level);
            headlinesByLevel[level - 1] = headlines;
            headlineCount += headlines.length;
        }
        final String[] sections = new String[headlineCount];
        int offset = 0;
        for (final String[] headlines : headlinesByLevel) {
            System.arraycopy(headlines, 0, sections, offset, headlines.length);
            offset += headlines.length;
        }

        // Keyed by URL so that each image URL is kept only once.
        final LinkedHashMap<DigestURL, ImageEntry> imagesByUrl = new LinkedHashMap<DigestURL, ImageEntry>();
        for (final ImageEntry image : scraper.getImages()) {
            imagesByUrl.put(image.url(), image);
        }

        final Document result = new Document(
                location,
                mimeType,
                charSet,
                this,
                scraper.getContentLanguages(),
                scraper.getKeywords(),
                scraper.getTitles(),
                scraper.getAuthor(),
                scraper.getPublisher(),
                sections,
                scraper.getDescriptions(),
                scraper.getLon(),
                scraper.getLat(),
                scraper.getText(),
                scraper.getAnchors(),
                scraper.getRSS(),
                imagesByUrl,
                scraper.indexingDenied(),
                scraper.getDate());
        result.setScraperObject(scraper);
        result.setIcons(scraper.getIcons());
        result.setLinkedDataTypes(scraper.getLinkedDataTypes());
        result.setPartiallyParsed(scraper.isLimitsExceeded());
        return result;
    }

    /**
     * Parses HTML held in a string and returns the scraper filled with the extracted content.
     * <p>
     * The string is encoded to bytes with {@code documentCharset}, or through
     * {@code UTF8.getBytes} when no charset is given or the given one is unsupported, and then
     * handed to the stream based {@code parseToScraper} without any byte limit.
     *
     * @param location              URL the content belongs to
     * @param documentCharset       charset name used to encode {@code input}, may be {@code null}
     * @param defaultValency        default tag valency forwarded to the scraper
     * @param valencySwitchTagNames tag names forwarded to the scraper as valency switches
     * @param vocabularyScraper     vocabulary scraper handed to the content scraper
     * @param timezoneOffset        time zone offset handed to the content scraper
     * @param input                 the HTML text to parse
     * @param maxAnchors            maximum number of anchors to collect
     * @param maxLinks              maximum number of links to collect
     * @return the scraper holding the parsed content
     * @throws IOException on read errors, or wrapping a {@code Parser.Failure} (kept as cause)
     */
    public static ContentScraper parseToScraper(DigestURL location, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocabularyScraper, int timezoneOffset, String input, int maxAnchors, int maxLinks) throws IOException {
        final Charset[] detectedcharsetcontainer = new Charset[]{null};

        // Charset name announced to the parser; must describe how the bytes below were produced.
        // NOTE(review): when documentCharset is null the bytes come from UTF8.getBytes but
        // charset detection still runs on them downstream - confirm that this is intended.
        String effectiveCharset = documentCharset;
        ByteArrayInputStream sourceStream;
        try {
            sourceStream = new ByteArrayInputStream(documentCharset == null ? UTF8.getBytes(input) : input.getBytes(documentCharset));
        } catch (UnsupportedEncodingException e) {
            // The requested charset is unusable: encode through UTF8.getBytes instead and stop
            // announcing the unusable name, otherwise decoding would fall back to the platform
            // default charset (presumably UTF8.getBytes yields UTF-8 bytes).
            sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
            effectiveCharset = StandardCharsets.UTF_8.name();
        }

        try {
            return htmlParser.parseToScraper(location, effectiveCharset, defaultValency, valencySwitchTagNames,
                    vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream,
                    maxAnchors, maxLinks, Long.MAX_VALUE);
        } catch (Parser.Failure e) {
            // Keep the original failure as cause so that its stack trace is not lost.
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Parses an HTML stream and returns the scraper filled with the extracted content.
     * <p>
     * The charset used for decoding is determined in this order:
     * <ol>
     * <li>the patched form of {@code documentCharset} (see {@code patchCharsetEncoding}),</li>
     * <li>the charset detected by a {@link ScraperInputStream} wrapped around the source,</li>
     * <li>the best match of the ICU {@link CharsetDetector},</li>
     * <li>{@link Charset#defaultCharset()} when nothing was found or the name is not usable.</li>
     * </ol>
     * The chosen charset is stored in {@code detectedcharsetcontainer[0]}.
     *
     * @param location                 URL the content belongs to
     * @param documentCharset          charset name announced for the content, may be {@code null}
     * @param defaultValency           default tag valency, forwarded to the charset-detecting stream
     * @param valencySwitchTagNames    tag names forwarded to the scraper as valency switches
     * @param vocabularyScraper        vocabulary scraper handed to the content scraper
     * @param detectedcharsetcontainer one-element array receiving the charset finally used
     * @param timezoneOffset           time zone offset handed to the content scraper
     * @param sourceStream             stream delivering the content
     * @param maxAnchors               maximum number of anchors to collect
     * @param maxLinks                 maximum number of links to collect
     * @param maxBytes                 maximum number of bytes to process; converted to a character
     *                                 budget with the decoder's average chars-per-byte ratio
     * @return the scraper holding the parsed content; its content-size-limit flag is set when
     *         the stream held more than the character budget
     * @throws Parser.Failure on an I/O error while copying the content, or when the writer
     *                        reports the content as suspected binary data
     * @throws IOException    on I/O errors during charset detection or while closing the writer
     */
    public static ContentScraper parseToScraper(DigestURL location, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocabularyScraper, Charset[] detectedcharsetcontainer, int timezoneOffset, InputStream sourceStream, int maxAnchors, int maxLinks, long maxBytes) throws Parser.Failure, IOException {
        // 1) Trust the announced charset when it can be normalized.
        String charset = null;
        if (documentCharset != null) {
            charset = htmlParser.patchCharsetEncoding(documentCharset);
        }

        // 2) Otherwise let the scraper stream look for a charset declaration.
        if (charset == null) {
            final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, valencySwitchTagNames, defaultValency, vocabularyScraper, location, false, maxLinks, timezoneOffset);
            try {
                sourceStream = htmlFilter;
                charset = htmlFilter.detectCharset();
            } finally {
                // NOTE(review): sourceStream keeps being read through htmlFilter after this
                // close(); presumably ScraperInputStream.close() leaves the wrapped stream
                // open - confirm.
                htmlFilter.close();
            }
        }

        // 3) Otherwise ask ICU for a statistical guess.
        if (charset == null) {
            final CharsetDetector det = new CharsetDetector();
            det.enableInputFilter(true);
            final BufferedInputStream detStream = new BufferedInputStream(sourceStream);
            det.setText(detStream);
            // detect() yields null when no charset matches; fall through to the default then.
            final CharsetMatch bestMatch = det.detect();
            charset = bestMatch == null ? null : bestMatch.getName();
            sourceStream = detStream;
        }

        // 4) Resolve the name, falling back to the platform default charset.
        if (charset == null) {
            detectedcharsetcontainer[0] = Charset.defaultCharset();
        } else {
            try {
                detectedcharsetcontainer[0] = Charset.forName(charset);
            } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
                detectedcharsetcontainer[0] = Charset.defaultCharset();
            }
        }

        // NOTE(review): the scraper is created with TagValency.EVAL, not with defaultValency;
        // confirm whether ignoring the parameter here is intended.
        final ContentScraper scraper = new ContentScraper(location, maxAnchors, maxLinks, valencySwitchTagNames, TagValency.EVAL, vocabularyScraper, timezoneOffset);
        final TransformerWriter writer = new TransformerWriter(null, null, scraper, false, Math.max(64, Math.min(4096, sourceStream.available())));
        try {
            // Translate the byte budget into a character budget for the reader based copy.
            final long maxChars = (long)((float)maxBytes * detectedcharsetcontainer[0].newDecoder().averageCharsPerByte());
            final InputStreamReader sourceReader = new InputStreamReader(sourceStream, detectedcharsetcontainer[0]);
            final long copiedChars = IOUtils.copyLarge(sourceReader, writer, 0L, maxChars);
            if (copiedChars > maxChars) {
                scraper.setContentSizeLimitExceeded(true);
            } else if (copiedChars == maxChars && sourceReader.read() >= 0) {
                // The budget was used up exactly and there is still content left.
                scraper.setContentSizeLimitExceeded(true);
            }
        } catch (IOException e) {
            throw new Parser.Failure("IO error:" + e.getMessage(), location);
        } finally {
            // Close the writer even when flushing fails.
            try {
                writer.flush();
            } finally {
                writer.close();
            }
        }

        if (writer.binarySuspect()) {
            throw new Parser.Failure("Binary data found in resource", location);
        }
        return scraper;
    }

    /**
     * Normalizes a charset name as found in HTTP headers or HTML documents into a name that
     * is more likely to be accepted by {@code Charset.forName}.
     * <p>
     * The name is trimmed and upper-cased, then a series of prefix and pattern rules is
     * applied: common families (Shift_JIS, Big5, GB2312, UTF-8, US-ASCII, KOI8-R) are mapped
     * to a fixed name, {@code WINDOWS}/{@code CP125x} variants become {@code windows-NNNN},
     * and {@code ISO} variants get their missing hyphens (and the frequent {@code 8559} typo)
     * repaired. Names matching no rule are returned upper-cased with underscores replaced by
     * hyphens.
     *
     * @param encoding the charset name to normalize, may be {@code null}
     * @return the normalized name, or {@code null} when {@code encoding} is {@code null} or,
     *         after trimming, shorter than three characters
     */
    public static String patchCharsetEncoding(String encoding) {
        if (encoding == null) {
            return null;
        }
        // Trim before measuring, so that padded or blank names are rejected as well.
        encoding = encoding.trim();
        if (encoding.length() < 3) {
            return null;
        }
        encoding = encoding.toUpperCase(Locale.ROOT);

        if (encoding.startsWith("SHIFT")) {
            return "Shift_JIS";
        }
        if (encoding.startsWith("BIG")) {
            return "Big5";
        }
        if (encoding.startsWith("WINDOWS")) {
            // Lower-case prefix; a missing hyphen is inserted further down.
            encoding = "windows" + encoding.substring(7);
        }
        if (encoding.startsWith("MACINTOSH")) {
            encoding = "MacRoman";
        }

        // From here on only the hyphenated spelling is considered.
        encoding = encoding.replace('_', '-');

        if (encoding.matches("GB[_-]?2312([-_]80)?")) {
            return "GB2312";
        }
        if (encoding.matches(".*UTF[-_]?8.*")) {
            return StandardCharsets.UTF_8.name();
        }
        if (encoding.startsWith("US")) {
            return StandardCharsets.US_ASCII.name();
        }
        if (encoding.startsWith("KOI")) {
            return "KOI8-R";
        }

        // "windows1252" -> "windows-1252"
        if (encoding.startsWith("windows") && encoding.length() > 7 && isDigit(encoding.charAt(7))) {
            encoding = "windows-" + encoding.substring(7);
        }

        if (encoding.startsWith("ISO")) {
            // "ISO8859-1" -> "ISO-8859-1"
            if (encoding.length() > 3 && isDigit(encoding.charAt(3))) {
                encoding = "ISO-" + encoding.substring(3);
            }
            // "ISO-88591" -> "ISO-8859-1"
            if (encoding.length() > 8 && isDigit(encoding.charAt(8))) {
                encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
            }
        }
        // Repair the frequent digit swap "8559" -> "8859".
        if (encoding.startsWith("ISO-8559")) {
            encoding = "ISO-8859" + encoding.substring(8);
        }

        // "CP1252" / "CP-1252" -> "windows-1252"
        if (encoding.matches("CP([_-])?125[0-8]")) {
            encoding = isDigit(encoding.charAt(2))
                    ? "windows-" + encoding.substring(2)
                    : "windows" + encoding.substring(2);
        }
        return encoding;
    }

    /** Tells whether the character is one of the ASCII digits '0' to '9'. */
    private static boolean isDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Fetches and parses the {@code _escaped_fragment_} snapshot of a "hashbang" page.
     * <p>
     * The snapshot URL is the normal form of {@code location} with an
     * {@code _escaped_fragment_} parameter appended: its value is the escaped URL reference
     * without the leading {@code "!"} when the reference starts with {@code "!"}, and empty
     * otherwise. The parameter is appended with {@code "&"} when the URL already has a search
     * part and with {@code "?"} otherwise. The resulting document is attributed to
     * {@code location}, not to the snapshot URL.
     * <p>
     * The snapshot is optional: I/O and parser failures are logged and yield {@code null}.
     *
     * @return the snapshot document, or {@code null} when it could not be loaded or parsed
     */
    private Document parseAlternativeSnapshot(DigestURL location, String mimeType, String documentCharset, TagValency defaultValency, Set<String> valencySwitchTagNames, VocabularyScraper vocscraper, int timezoneOffset, int maxAnchors, int maxLinks, long maxBytes) {
        Document documentSnapshot = null;
        try {
            final String ref = location.getRef();
            final boolean hashbangRef = ref != null && ref.startsWith("!");
            // Null-safe: a URL without query must not abort the snapshot attempt.
            final boolean hasSearchpart = location.getSearchpart() != null && !location.getSearchpart().isEmpty();

            final StringBuilder snapshotUrl = new StringBuilder();
            snapshotUrl.append(location.toNormalform(true));
            // An existing query is extended, otherwise a new one is started.
            snapshotUrl.append(hasSearchpart ? '&' : '?').append("_escaped_fragment_=");
            if (hashbangRef) {
                snapshotUrl.append(MultiProtocolURL.escape(ref.substring(1)));
            }
            final DigestURL locationSnapshot = new DigestURL(snapshotUrl.toString());

            final Charset[] detectedcharsetcontainer = new Charset[]{null};
            InputStream snapshotStream = null;
            try {
                snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
                final ContentScraper scraperSnapshot = htmlParser.parseToScraper(location, documentCharset, defaultValency, valencySwitchTagNames, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxAnchors, maxLinks, maxBytes);
                documentSnapshot = this.transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
            } finally {
                if (snapshotStream != null) {
                    try {
                        snapshotStream.close();
                    } catch (IOException e) {
                        AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage());
                    }
                }
            }
            AbstractParser.log.info("parse snapshot " + locationSnapshot.toString() + " additional to " + location.toString());
        } catch (IOException | Parser.Failure e) {
            // Best effort only: report the problem and carry on without a snapshot.
            AbstractParser.log.warn("Could not parse snapshot of " + location.toString() + " : " + e.getMessage());
        }
        return documentSnapshot;
    }

    /**
     * Command line entry point: loads the URL given as first argument, parses it as
     * {@code text/html} with UTF-8 as announced charset and prints the document title.
     * <p>
     * Without an argument a usage message is printed and the JVM exits with status 1.
     * Otherwise the JVM exits with status 0, also when loading or parsing failed (the
     * stack trace is printed in that case).
     *
     * @param args command line arguments; {@code args[0]} is the URL to parse
     */
    public static void main(String[] args) {
        if (args == null || args.length == 0) {
            System.err.println("usage: htmlParser <url>");
            System.exit(1);
            return;
        }
        try {
            final DigestURL url = new DigestURL(args[0]);
            final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
            final Document[] document = new htmlParser().parse(url, "text/html", StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, new ByteArrayInputStream(content));
            final String title = document[0].dc_title();
            System.out.println(title);
        } catch (IOException | Parser.Failure e) {
            // MalformedURLException is an IOException and is handled here as well.
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag before reporting.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        System.exit(0);
    }
}

