/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.importer;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.document.importer.Importer;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;

/**
 * Background job that reads a WARC archive and hands the HTTP responses it
 * contains to the {@link Switchboard} indexer.
 *
 * <p>Only records whose {@code WARC-Type} is {@code response}, whose HTTP
 * status is 200 and for which {@code TextParser.supportsMime} returns
 * {@code null} are processed. Sources whose name ends with {@code .gz} are
 * transparently unzipped.</p>
 *
 * <p>The importer runs as its own thread; the progress getters and
 * {@link #quit()} are intended to be called from other threads while the
 * import is running.</p>
 */
public class WarcImporter
extends Thread
implements Importer {
    /** Importer currently inside {@link #indexWarcRecords(InputStream)}; {@code null} when none is running. */
    public static WarcImporter job;
    private final InputStream source;
    private final String name;
    private int recordCnt;
    private long startTime;
    /** Size of the source in bytes, or -1 when it is not known (URL sources). */
    private final long sourceSize;
    private long consumed;
    /** Set by {@link #quit()} from another thread, hence volatile so the import loop sees the change. */
    private volatile boolean abort = false;
    private final String collection;

    /**
     * Creates an importer reading from a URL. The total size of the source is
     * unknown in this case.
     *
     * @param url        location of the WARC archive
     * @param collection collection name set on the crawl profile used for the import
     * @throws IOException if the stream cannot be opened or is not valid gzip data
     */
    public WarcImporter(MultiProtocolURL url, String collection) throws IOException {
        super("WarcImporter - from InputStream");
        this.recordCnt = 0;
        this.sourceSize = -1L;
        this.name = url.toNormalform(true);
        this.source = WarcImporter.unzipIfNeeded(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), this.name);
        this.collection = collection;
    }

    /**
     * Creates an importer reading from a local file.
     *
     * @param f          the WARC file
     * @param collection collection name set on the crawl profile used for the import
     * @throws IOException if the file cannot be opened or is not valid gzip data
     */
    public WarcImporter(File f, String collection) throws IOException {
        super("WarcImporter - from file " + f.getName());
        this.name = f.getName();
        this.sourceSize = f.length();
        this.source = WarcImporter.unzipIfNeeded(new FileInputStream(f), this.name);
        this.collection = collection;
    }

    /**
     * Creates an importer that reads from {@code is} when {@code f} does not
     * exist, and from the file {@code f} otherwise.
     *
     * <p>When the given stream is used it is taken as-is (no gzip handling) and
     * its size is estimated with {@link InputStream#available()}. When the file
     * is used, {@code is} is ignored and stays owned by the caller.</p>
     *
     * @param f          file naming the source; read directly when it exists
     * @param is         stream to read from when {@code f} does not exist; may be {@code null}
     * @param collection collection name set on the crawl profile used for the import
     * @throws IOException if neither source can be opened or the file is not valid gzip data
     */
    public WarcImporter(File f, InputStream is, String collection) throws IOException {
        super("WarcImporter - from file " + f.getName());
        this.name = f.getName();
        if (!f.exists() && is != null) {
            this.sourceSize = is.available();
            this.source = is;
        } else {
            this.sourceSize = f.length();
            this.source = WarcImporter.unzipIfNeeded(new FileInputStream(f), this.name);
        }
        this.collection = collection;
    }

    /**
     * Wraps {@code raw} in a {@link GZIPInputStream} when {@code sourceName}
     * ends with {@code .gz}; otherwise returns {@code raw} unchanged.
     * If the gzip header cannot be read, {@code raw} is closed before the
     * exception is rethrown so the underlying handle does not leak.
     */
    private static InputStream unzipIfNeeded(InputStream raw, String sourceName) throws IOException {
        if (!sourceName.endsWith(".gz")) {
            return raw;
        }
        try {
            return new GZIPInputStream(raw);
        }
        catch (IOException e) {
            WarcImporter.closeQuietly(raw);
            throw e;
        }
    }

    /** Closes {@code in} if it is not {@code null}, ignoring any failure of the close itself. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        }
        catch (IOException ignored) {
            // best effort: nothing useful can be done when close fails
        }
    }

    /**
     * Reads WARC records from {@code f} until the stream is exhausted or
     * {@link #quit()} was called, and passes every supported HTTP response to
     * the indexer.
     *
     * <p>While this method runs, {@link #job} refers to this importer. The
     * reader is closed and {@link #job} is reset even if the import fails.</p>
     *
     * @param f stream positioned at the beginning of WARC data
     * @throws IOException if the WARC reader cannot be created or a record cannot be read
     */
    public void indexWarcRecords(InputStream f) throws IOException {
        job = this;
        this.startTime = System.currentTimeMillis();
        WarcReader warcReader = null;
        try {
            CrawlProfile warcProfile = (CrawlProfile)Switchboard.getSwitchboard().crawler.defaultPackProfile.clone();
            warcProfile.setCollections(this.collection);
            warcProfile.setHandle();
            warcReader = WarcReaderFactory.getReader((InputStream)f);
            WarcRecord record = warcReader.getNextRecord();
            while (record != null && !this.abort) {
                HeaderLine type = record.getHeader("WARC-Type");
                if (type != null && "response".equals(type.value)) {
                    this.indexResponseRecord(record, warcProfile);
                }
                this.consumed = warcReader.getConsumed();
                record = warcReader.getNextRecord();
            }
            ConcurrentLog.info("WarcImporter", "Indexed " + this.recordCnt + " documents");
        }
        finally {
            if (warcReader != null) {
                warcReader.close();
            }
            job = null;
        }
    }

    /**
     * Handles a single record of type {@code response}: extracts the target
     * URL and the HTTP payload and passes them to the indexer.
     *
     * <p>Records without a target URI, with a malformed URL, without a payload,
     * with a status other than 200 or with an unsupported mime type are
     * skipped. Errors while reading the payload are logged and do not stop the
     * import; such records are still counted in {@link #count()}.</p>
     */
    private void indexResponseRecord(WarcRecord record, CrawlProfile warcProfile) throws IOException {
        HeaderLine targetUri = record.getHeader("WARC-Target-URI");
        if (targetUri == null || targetUri.value == null) {
            ConcurrentLog.info("WarcImporter", "skipping response record without WARC-Target-URI");
            return;
        }
        String url = targetUri.value;
        // the URI may be written in angle brackets: <http://...>
        if (url.startsWith("<") && url.endsWith(">")) {
            url = url.substring(1, url.length() - 1);
        }
        DigestURL location;
        try {
            location = new DigestURL(url);
        }
        catch (IOException e) {
            // one unparsable URL must not abort the import of the whole archive
            ConcurrentLog.info("WarcImporter", "skipping record with malformed URL " + url + ": " + e.getMessage());
            return;
        }
        HttpHeader http = record.getHttpHeader();
        if (http == null || http.statusCode != 200 || TextParser.supportsMime(http.contentType) != null) {
            return;
        }
        InputStream istream = record.getPayloadContent();
        if (istream == null) {
            ConcurrentLog.info("WarcImporter", "skipping record without payload: " + url);
            return;
        }
        HeaderLine transferEncoding = http.getHeader("Transfer-Encoding");
        try {
            byte[] content;
            if (transferEncoding != null && transferEncoding.value.contains("chunked")) {
                // decode the chunked body and read it until the end of the stream
                int c;
                istream = new ChunkedInputStream(istream);
                ByteBuffer bbuffer = new ByteBuffer();
                while ((c = istream.read()) >= 0) {
                    bbuffer.append(c);
                }
                content = bbuffer.getBytes();
                bbuffer.close();
            } else {
                content = WarcImporter.readPayload(istream, http.getPayloadLength());
            }
            RequestHeader requestHeader = new RequestHeader();
            ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
            for (HeaderLine hx : http.getHeaderList()) {
                responseHeader.put(hx.name, hx.value);
            }
            Request request = new Request(ASCII.getBytes(Switchboard.getSwitchboard().peers.mySeed().hash), location, requestHeader.referer() == null ? null : requestHeader.referer().hash(), "warc", responseHeader.lastModified(), warcProfile.handle(), 0, warcProfile.timezoneOffset());
            Response response = new Response(request, requestHeader, responseHeader, warcProfile, false, content);
            String error = Switchboard.getSwitchboard().toIndexer(response);
            if (error != null) {
                ConcurrentLog.info("WarcImporter", "error parsing: " + error);
            }
        }
        catch (IOException e) {
            ConcurrentLog.info("WarcImporter", "error reading: " + e.getMessage());
        }
        finally {
            WarcImporter.closeQuietly(istream);
        }
        ++this.recordCnt;
    }

    /**
     * Reads up to {@code declaredLength} bytes from {@code in}.
     *
     * <p>A single {@code read} call may return fewer bytes than requested, so
     * reading is repeated until the buffer is full or the stream ends. If the
     * stream ends early, only the bytes actually read are returned.</p>
     *
     * @throws IOException if reading fails or {@code declaredLength} does not fit into a byte array
     */
    private static byte[] readPayload(InputStream in, long declaredLength) throws IOException {
        if (declaredLength < 0L || declaredLength > (long)Integer.MAX_VALUE) {
            throw new IOException("unsupported payload length: " + declaredLength);
        }
        byte[] buffer = new byte[(int)declaredLength];
        int filled = 0;
        while (filled < buffer.length) {
            int n = in.read(buffer, filled, buffer.length - filled);
            if (n < 0) {
                break;
            }
            filled += n;
        }
        if (filled == buffer.length) {
            return buffer;
        }
        byte[] truncated = new byte[filled];
        System.arraycopy(buffer, 0, truncated, 0, filled);
        return truncated;
    }

    /**
     * Thread entry point: imports the source given at construction time and
     * closes it afterwards. I/O failures are logged, not rethrown.
     */
    @Override
    public void run() {
        try {
            this.indexWarcRecords(this.source);
        }
        catch (IOException ex) {
            ConcurrentLog.info("WarcImporter", ex.getMessage());
        }
        finally {
            WarcImporter.closeQuietly(this.source);
        }
    }

    /** Requests the import loop to stop before reading the next record. */
    public void quit() {
        this.abort = true;
    }

    /** @return the name of the source: the normalized URL or the file name */
    @Override
    public String source() {
        return this.name;
    }

    /** @return the number of response records processed so far */
    @Override
    public int count() {
        return this.recordCnt;
    }

    /**
     * @return processed records per second since the import started; the
     *         elapsed time is taken as at least one second to avoid a division by zero
     */
    @Override
    public int speed() {
        if (this.recordCnt == 0) {
            return 0;
        }
        return (int)((long)this.recordCnt / Math.max(1L, this.runningTime()));
    }

    /** @return whole seconds elapsed since {@link #indexWarcRecords(InputStream)} was started */
    @Override
    public long runningTime() {
        return (System.currentTimeMillis() - this.startTime) / 1000L;
    }

    /**
     * Estimates the remaining time by extrapolating the time needed for the
     * bytes consumed so far to the bytes still outstanding.
     *
     * @return estimated remaining seconds; 0 when nothing was consumed yet,
     *         when the source size is unknown, or when more bytes were consumed
     *         than the source size states
     */
    @Override
    public long remainingTime() {
        long consumedBytes = this.consumed;
        if (consumedBytes <= 0L) {
            return 0L;
        }
        long outstanding = this.sourceSize - consumedBytes;
        if (outstanding <= 0L) {
            return 0L;
        }
        // outstanding / (consumed / elapsed), rearranged so that no divisor can become zero
        return outstanding * this.runningTime() / consumedBytes;
    }

    /** @return always the empty string */
    @Override
    public String status() {
        return "";
    }
}

