/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.parser;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Parser that turns a PDF input stream into one or more {@link Document}s
 * using Apache PDFBox.
 *
 * <p>Depending on {@link #individualPages} the parser produces either a single
 * document for the whole file or one document per page. In both modes the URI
 * link annotations found on the pages are attached as anchors.
 */
public class pdfParser
extends AbstractParser
implements Parser {
    /** When {@code true}, {@link #parse} emits one {@link Document} per PDF page. */
    public static boolean individualPages = false;
    /** Name of the URL query parameter that carries the page number in per-page mode. */
    public static String individualPagePropertyname = "page";

    /** Memory (200 MiB) requested before parsing and handed to PDFBox as its main-memory limit. */
    private static final long PARSER_MEMORY_BYTES = 200L * 1024L * 1024L;
    /** Size argument given to the {@link CharBuffer} that collects the extracted text. */
    private static final int TEXT_BUFFER_SIZE = 200 * 1024 * 1024;
    /** Number of leading pages whose text is always extracted synchronously. */
    private static final int EAGER_PAGES = 3;
    /** How long to wait for the text of the remaining pages before giving up on it. */
    private static final long REMAINING_TEXT_TIMEOUT_MS = 3000L;
    /** Log channel name used for parser warnings. */
    private static final String LOG_NAME = "PDFPARSER";

    /** Registers the file extension and the MIME types handled by this parser. */
    public pdfParser() {
        super("Acrobat Portable Document Parser");
        this.SUPPORTED_EXTENSIONS.add("pdf");
        this.SUPPORTED_MIME_TYPES.add("application/pdf");
        this.SUPPORTED_MIME_TYPES.add("application/x-pdf");
        this.SUPPORTED_MIME_TYPES.add("application/acrobat");
        this.SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
        this.SUPPORTED_MIME_TYPES.add("text/pdf");
        this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
    }

    /**
     * Parses a PDF stream into documents.
     *
     * @param location       URL of the PDF; used for failure reporting, as title fallback and,
     *                       in per-page mode, as base of the page URLs. May be {@code null}
     *                       (as {@link #main} does); per-page mode then yields no result.
     * @param mimeType       MIME type stored in the resulting documents
     * @param charset        not used by this parser; results are always declared as UTF-8
     * @param scraper        not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source         stream delivering the PDF bytes
     * @return the parsed documents, or {@code null} when link or text extraction failed
     * @throws Parser.Failure if memory is insufficient, the PDF cannot be loaded, or it is
     *                        encrypted without permission to extract content
     * @throws InterruptedException if the calling thread is interrupted while waiting for
     *                              the text of the trailing pages
     */
    @Override
    public Document[] parse(DigestURL location, String mimeType, String charset, VocabularyScraper scraper, int timezoneOffset, InputStream source) throws Parser.Failure, InterruptedException {
        if (!MemoryControl.request(PARSER_MEMORY_BYTES, false)) {
            throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
        }
        final PDDocument pdfDoc = loadDocument(location, source);
        try {
            if (pdfDoc.isEncrypted()) {
                final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
                if (perm == null || !perm.canExtractContent()) {
                    throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
                }
            }

            // Collect the metadata; every field may be absent.
            String docTitle = null;
            String docSubject = null;
            String docAuthor = null;
            String docPublisher = null;
            String docKeywordStr = null;
            Date docDate = new Date();
            final PDDocumentInformation info = pdfDoc.getDocumentInformation();
            if (info != null) {
                docTitle = info.getTitle();
                docSubject = info.getSubject();
                docAuthor = info.getAuthor();
                docPublisher = info.getProducer();
                if (docPublisher == null || docPublisher.isEmpty()) {
                    docPublisher = info.getCreator();
                }
                docKeywordStr = info.getKeywords();
                if (info.getModificationDate() != null) {
                    docDate = info.getModificationDate().getTime();
                }
            }
            // Title fallbacks: file name of the URL (only if a URL was given), then the subject.
            if ((docTitle == null || docTitle.isEmpty()) && location != null) {
                docTitle = MultiProtocolURL.unescape(location.getFileName());
            }
            if (docTitle == null) {
                docTitle = docSubject;
            }
            final String[] docKeywords = docKeywordStr == null ? null : docKeywordStr.split(" |,");

            Document[] result = null;
            try {
                final List<Collection<AnchorURL>> pdflinks = this.extractPdfLinks(pdfDoc);
                final PDFTextStripper stripper = new PDFTextStripper();
                if (individualPages) {
                    final int pagecount = pdfDoc.getNumberOfPages();
                    final String[] pages = new String[pagecount];
                    for (int page = 1; page <= pagecount; ++page) {
                        stripper.setStartPage(page);
                        stripper.setEndPage(page);
                        pages[page - 1] = stripper.getText(pdfDoc);
                    }
                    assert (pages.length == pdflinks.size()) : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
                    final Document[] pageDocs = new Document[Math.min(pages.length, pdflinks.size())];
                    final String loc = location.toNormalform(true);
                    for (int page = 0; page < pageDocs.length; ++page) {
                        pageDocs[page] = new Document(new AnchorURL(pageUrl(loc, page + 1)), mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0, 0.0, UTF8.getBytes(pages[page]), pdflinks.get(page), null, null, false, docDate);
                    }
                    // Publish only a completely filled array so callers never see null entries.
                    result = pageDocs;
                } else {
                    final CharBuffer writer = new CharBuffer(TEXT_BUFFER_SIZE);
                    byte[] contentBytes;
                    try {
                        // The first pages are always extracted on the calling thread.
                        stripper.setEndPage(EAGER_PAGES);
                        writer.append(stripper.getText(pdfDoc));
                        contentBytes = writer.getBytes();
                        if (pdfDoc.getNumberOfPages() > EAGER_PAGES) {
                            stripper.setStartPage(EAGER_PAGES + 1);
                            stripper.setEndPage(Integer.MAX_VALUE);
                            // The remaining pages run on a separate thread so extraction
                            // can be abandoned after a timeout.
                            final Thread worker = new Thread("pdfParser.getText:" + location){

                                @Override
                                public void run() {
                                    try {
                                        writer.append(stripper.getText(pdfDoc));
                                    }
                                    catch (Throwable ignored) {
                                        // best effort: the text gathered so far is still used
                                    }
                                }
                            };
                            worker.start();
                            try {
                                worker.join(REMAINING_TEXT_TIMEOUT_MS);
                            }
                            finally {
                                // Also reached when join() is interrupted: stop the worker either way.
                                if (worker.isAlive()) {
                                    worker.interrupt();
                                }
                            }
                            contentBytes = writer.getBytes();
                        }
                    }
                    finally {
                        // Close the buffer on every path, not only for documents with trailing pages.
                        writer.close();
                    }
                    result = new Document[]{new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0, 0.0, contentBytes, combineLinks(pdflinks), null, null, false, docDate)};
                }
            }
            catch (InterruptedException e) {
                // Interruption is part of the method contract and must reach the caller.
                throw e;
            }
            catch (Throwable e) {
                // Extraction stays best effort (the result remains null), but the cause is logged.
                ConcurrentLog.warn(LOG_NAME, "Could not extract content of " + location + ": " + e);
            }
            return result;
        }
        finally {
            closeQuietly(pdfDoc);
            pdfParser.clearPdfBoxCaches();
        }
    }

    /**
     * Loads the PDF with lowered thread priority and restores the previous priority afterwards.
     *
     * @throws Parser.Failure if PDFBox reports an I/O error while loading
     */
    private static PDDocument loadDocument(DigestURL location, InputStream source) throws Parser.Failure {
        final Thread current = Thread.currentThread();
        final int previousPriority = current.getPriority();
        try {
            current.setPriority(Thread.MIN_PRIORITY);
            return PDDocument.load(source, MemoryUsageSetting.setupMixed(PARSER_MEMORY_BYTES));
        }
        catch (IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
        finally {
            current.setPriority(previousPriority);
        }
    }

    /** Closes the document, ignoring any error raised while closing. */
    private static void closeQuietly(PDDocument doc) {
        try {
            doc.close();
        }
        catch (Throwable ignored) {
            // nothing useful can be done if closing fails
        }
    }

    /** Builds the URL of a single page by appending the page-number query parameter to {@code loc}. */
    private static String pageUrl(String loc, int pageNumber) {
        return loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + pageNumber;
    }

    /** Merges the per-page link collections into one duplicate-free collection. */
    private static Collection<AnchorURL> combineLinks(List<Collection<AnchorURL>> linksPerPage) {
        final Collection<AnchorURL> combined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pageLinks : linksPerPage) {
            if (pageLinks != null) {
                combined.addAll(pageLinks);
            }
        }
        return combined;
    }

    /**
     * Collects the URI link annotations of every page.
     *
     * @param pdf the loaded document
     * @return one collection per page, in page order; a page without readable links
     *         contributes an empty collection
     */
    private List<Collection<AnchorURL>> extractPdfLinks(PDDocument pdf) {
        final List<Collection<AnchorURL>> linkCollections = new ArrayList<Collection<AnchorURL>>(pdf.getNumberOfPages());
        for (PDPage page : pdf.getPages()) {
            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
            try {
                final List<PDAnnotation> annotations = page.getAnnotations();
                if (annotations != null) {
                    for (PDAnnotation annotation : annotations) {
                        if (!(annotation instanceof PDAnnotationLink)) {
                            continue;
                        }
                        final PDAction action = ((PDAnnotationLink)annotation).getAction();
                        if (!(action instanceof PDActionURI)) {
                            continue;
                        }
                        final String uristr = ((PDActionURI)action).getURI();
                        try {
                            pdflinks.add(new AnchorURL(uristr));
                        }
                        catch (IOException ignored) {
                            // skip only this unusable link and keep the other links of the page
                        }
                    }
                }
            }
            catch (IOException ignored) {
                // the annotations of this page cannot be read; keep what was collected
            }
            linkCollections.add(pdflinks);
        }
        return linkCollections;
    }

    /** Clears the static name resources held by PDFBox via {@link COSName#clearResources()}. */
    public static void clearPdfBoxCaches() {
        COSName.clearResources();
    }

    /**
     * Command-line test driver: parses the PDF named by the first argument, prints
     * statistics and writes the extracted text to {@code parsedPdf.txt}.
     *
     * @param args the path of the PDF file as first element
     */
    public static void main(String[] args) {
        if (args.length == 0 || args[0].isEmpty()) {
            System.out.println("Please give a filename as first argument.");
            return;
        }
        final File pdfFile = new File(args[0]);
        if (!pdfFile.canRead()) {
            System.err.println("Cannot read file " + pdfFile.getAbsolutePath());
            return;
        }

        System.out.println(pdfFile.getAbsolutePath());
        final long startTime = System.currentTimeMillis();
        final pdfParser parser = new pdfParser();
        Document document = null;
        FileInputStream inStream = null;
        try {
            inStream = new FileInputStream(pdfFile);
            final Document[] parsed = parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream);
            document = Document.mergeDocuments(null, "application/pdf", parsed);
        }
        catch (Parser.Failure e) {
            System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
            ConcurrentLog.logException(e);
        }
        catch (InterruptedException e) {
            System.err.println("Interrupted while parsing!");
            ConcurrentLog.logException(e);
        }
        catch (NoClassDefFoundError e) {
            System.err.println("class not found: " + e.getMessage());
        }
        catch (FileNotFoundException e) {
            ConcurrentLog.logException(e);
        }
        finally {
            if (inStream != null) {
                try {
                    inStream.close();
                }
                catch (IOException e) {
                    System.err.println("Could not close input stream on file " + pdfFile);
                }
            }
        }
        System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");

        if (document == null) {
            System.out.println("\t!!!Parsing without result!!!");
            return;
        }
        System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
        final InputStream textStream = document.getTextStream();
        try {
            FileUtils.copy(textStream, new File("parsedPdf.txt"));
        }
        catch (IOException e) {
            System.err.println("error saving parsed document");
            ConcurrentLog.logException(e);
        }
        finally {
            try {
                if (textStream != null) {
                    textStream.close();
                }
            }
            catch (IOException e) {
                ConcurrentLog.warn("PDFPARSER", "Could not close text input stream");
            }
        }
    }
}

