/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.crawler.robots;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.crawler.robots.RobotsTxtParser;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.util.NamePrefixThreadFactory;
import net.yacy.repository.Blacklist;
import net.yacy.repository.LoaderDispatcher;

public class RobotsTxt {
    /** Logger used by every method of this class; named after the class. */
    private static final ConcurrentLog log = new ConcurrentLog(RobotsTxt.class.getName());
    /** Path appended to a host:port to build the robots.txt URL; also compared against in isRobotsURL. */
    protected static final String ROBOTS_TXT_PATH = "/robots.txt";
    /** Separator string; not referenced inside this class - presumably used by collaborators in this package (TODO confirm). */
    protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
    /** Pre-compiled pattern for the separator above; not referenced inside this class. */
    protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(";");
    /** One monitor object per host:port key; downloads for a host run while holding that host's monitor. */
    private final ConcurrentMap<String, DomSync> syncObjects;
    /** Source of the "robots" heap in which the entries are stored. */
    private final WorkTables tables;
    /** Loader used to download robots.txt files (and root URLs in massCrawlCheck). */
    private final LoaderDispatcher loader;
    /** Fixed-size pool running the concurrent ensureExist tasks and the massCrawlCheck tasks. */
    private final ThreadPoolExecutor threadPool;

    /**
     * Creates the robots.txt cache on top of the given work tables.
     * <p>
     * The "robots" heap is opened once up front. If that fails with an
     * {@link IOException}, a second attempt is made and the heap is cleared;
     * a failure of that recovery is logged and the constructor still returns.
     *
     * @param worktables      provider of the "robots" heap
     * @param loader          loader used to download robots.txt files
     * @param maxActiveTheads core and maximum size of the internal thread pool
     */
    public RobotsTxt(WorkTables worktables, LoaderDispatcher loader, int maxActiveTheads) {
        this.threadPool = new ThreadPoolExecutor(maxActiveTheads, maxActiveTheads, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>(), new NamePrefixThreadFactory(RobotsTxt.class.getSimpleName()));
        this.syncObjects = new ConcurrentHashMap<String, DomSync>();
        this.tables = worktables;
        this.loader = loader;
        try {
            this.tables.getHeap("robots");
        } catch (IOException e) {
            log.warn("cannot open robots table, trying to reset it", e);
            try {
                this.tables.getHeap("robots").clear();
            } catch (IOException resetFailure) {
                // Best effort only: construction must not fail, but the problem should be visible.
                log.severe("cannot reset robots table", resetFailure);
            }
        }
    }

    /**
     * Empties the "robots" heap and forgets all per-host monitor objects.
     *
     * @throws IOException if the heap cannot be obtained or cleared
     */
    public void clear() throws IOException {
        log.info("clearing robots table");
        final BEncodedHeap robotsTable = this.tables.getHeap("robots");
        robotsTable.clear();
        this.syncObjects.clear();
    }

    /**
     * Stops the internal thread pool via {@code shutdownNow()}.
     */
    public void close() {
        if (this.threadPool == null) {
            return;
        }
        this.threadPool.shutdownNow();
    }

    /**
     * Reports the size of the "robots" heap.
     *
     * @return the value of the heap's {@code size()}
     * @throws IOException if the heap cannot be obtained
     */
    public int size() throws IOException {
        final BEncodedHeap robotsTable = this.tables.getHeap("robots");
        return robotsTable.size();
    }

    /**
     * Looks up the robots.txt entry for the host of the given URL, downloading
     * it if no entry is stored yet.
     *
     * @param theURL URL whose host:port selects the entry; must not be null
     * @param agent  agent passed through to the host:port based lookup
     * @return the entry, or {@code null} if the protocol does not start with
     *         "http", the URL has no host, or the host:port based lookup
     *         returns {@code null}
     * @throws IllegalArgumentException if {@code theURL} is null
     */
    public RobotsTxtEntry getEntry(MultiProtocolURL theURL, ClientIdentification.Agent agent) {
        if (theURL == null) {
            throw new IllegalArgumentException("theURL must not be null");
        }
        if (!theURL.getProtocol().startsWith("http")) {
            return null;
        }
        final String urlHostPort = RobotsTxt.getHostPort(theURL);
        if (urlHostPort == null) {
            // getHostPort returns null for URLs without a host; a null key must not reach the table or the lock map.
            return null;
        }
        return this.getEntry(urlHostPort, agent, true);
    }

    /**
     * Looks up the robots.txt entry stored under the given host:port key and,
     * if requested and nothing is stored, downloads the robots.txt file.
     * <p>
     * The download runs while holding a per-host monitor, and the table is
     * re-checked after the monitor is acquired, so a thread that waited on a
     * concurrent download for the same host picks up the stored result.
     *
     * @param urlHostPort key of the form produced by {@link #getHostPort(MultiProtocolURL)}
     * @param agent       agent used for the download and for selecting the robot IDs to parse for
     * @param fetchOnlineIfNotAvailableOrNotFresh if {@code true} and no entry is stored, try to download one
     * @return the stored or freshly downloaded entry; {@code null} if the table
     *         is unavailable, if nothing is stored and fetching is disabled, or
     *         if the download failed (a placeholder entry is then written to the
     *         table but not returned)
     */
    public RobotsTxtEntry getEntry(String urlHostPort, ClientIdentification.Agent agent, boolean fetchOnlineIfNotAvailableOrNotFresh) {
        final BEncodedHeap robotsTable;
        try {
            robotsTable = this.tables.getHeap("robots");
        } catch (IOException e1) {
            log.severe("tables not available", e1);
            return null;
        }
        if (robotsTable == null) {
            return null;
        }

        RobotsTxtEntry robotsTxt4Host = this.storedEntry(robotsTable, urlHostPort);
        if (!fetchOnlineIfNotAvailableOrNotFresh || robotsTxt4Host != null) {
            return robotsTxt4Host;
        }

        // Obtain the per-host monitor atomically so that all threads lock on the same object.
        DomSync syncObj = this.syncObjects.get(urlHostPort);
        if (syncObj == null) {
            final DomSync fresh = new DomSync();
            final DomSync existing = this.syncObjects.putIfAbsent(urlHostPort, fresh);
            syncObj = existing == null ? fresh : existing;
        }

        synchronized (syncObj) {
            // Another thread may have stored the entry while this one waited for the monitor.
            robotsTxt4Host = this.storedEntry(robotsTable, urlHostPort);
            if (robotsTxt4Host != null) {
                return robotsTxt4Host;
            }

            final DigestURL robotsURL = RobotsTxt.robotsURL(urlHostPort);
            Response response = null;
            if (robotsURL != null) {
                if (log.isFine()) {
                    log.fine("Trying to download the robots.txt file from URL '" + String.valueOf(robotsURL) + "'.");
                }
                final Request request = new Request(robotsURL, null);
                try {
                    response = this.loader.load(request, CacheStrategy.NOCACHE, null, agent);
                } catch (Throwable e) {
                    log.info("Trying to download the robots.txt file from URL '" + robotsURL.toNormalform(false) + "' failed - " + e.getMessage());
                    response = null;
                }
            }

            if (response == null) {
                // robotsTxt4Host is null here, so a placeholder entry is stored; null is still returned to the caller.
                // NOTE(review): robotsURL may be null at this point - confirm RobotsTxtEntry tolerates a null URL.
                this.processOldEntry(robotsTxt4Host, robotsURL, robotsTable);
                return robotsTxt4Host;
            }
            return this.processNewEntry(robotsURL, response, agent.robotIDs());
        }
    }

    /**
     * Reads the record stored under the given key and wraps it in an entry.
     * Read failures are logged and treated as "nothing stored".
     *
     * @param robotsTable the heap to read from
     * @param urlHostPort the host:port key
     * @return the stored entry, or {@code null} if there is none or reading failed
     */
    private RobotsTxtEntry storedEntry(BEncodedHeap robotsTable, String urlHostPort) {
        Map<String, byte[]> record;
        try {
            record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
        } catch (SpaceExceededException e) {
            log.warn("memory exhausted", e);
            record = null;
        } catch (IOException e) {
            log.warn("cannot get robotstxt from table", e);
            record = null;
        }
        return record == null ? null : new RobotsTxtEntry(urlHostPort, record);
    }

    /**
     * Removes the entry stored for the host of the given URL, if any.
     * Does nothing when the URL has no host or the table is unavailable;
     * a failing delete is logged and otherwise ignored.
     *
     * @param theURL URL whose host:port selects the entry to remove
     */
    public void delete(MultiProtocolURL theURL) {
        final String urlHostPort = RobotsTxt.getHostPort(theURL);
        if (urlHostPort == null) {
            return;
        }
        final BEncodedHeap robotsTable;
        try {
            robotsTable = this.tables.getHeap("robots");
        } catch (IOException e1) {
            log.severe("tables not available", e1);
            return;
        }
        if (robotsTable == null) {
            return;
        }
        try {
            robotsTable.delete(robotsTable.encodedKey(urlHostPort));
        } catch (IOException e) {
            // Deletion stays best effort, but the failure should not vanish without a trace.
            log.warn("cannot delete robots.txt entry for '" + urlHostPort + "'", e);
        }
    }

    /**
     * Makes sure the table holds an entry for the host of the given URL,
     * downloading the robots.txt file if the key is not present yet.
     * <p>
     * Nothing happens when the agent is not a robot, the URL is local, the URL
     * has no host, the table is unavailable, or the key already exists. The
     * download runs while holding the per-host monitor and re-checks the table
     * after acquiring it.
     *
     * @param theURL     URL whose host:port selects the entry
     * @param agent      agent used for the download and for selecting the robot IDs to parse for
     * @param concurrent if {@code true} the work is handed to the internal thread pool,
     *                   otherwise it runs in the calling thread
     */
    public void ensureExist(MultiProtocolURL theURL, final ClientIdentification.Agent agent, boolean concurrent) {
        if (!agent.isRobot()) {
            return;
        }
        if (theURL.isLocal()) {
            return;
        }
        final String urlHostPort = RobotsTxt.getHostPort(theURL);
        if (urlHostPort == null) {
            return;
        }
        final BEncodedHeap robotsTable;
        try {
            robotsTable = this.tables.getHeap("robots");
        } catch (IOException e1) {
            log.severe("tables not available", e1);
            return;
        }
        // Without a table the task below could neither check nor store anything.
        if (robotsTable == null || robotsTable.containsKey((Object)robotsTable.encodedKey(urlHostPort))) {
            return;
        }

        // A plain Runnable is sufficient: the task is either run inline or executed by the pool.
        final Runnable task = new Runnable() {
            @Override
            public void run() {
                // Obtain the per-host monitor atomically so that all threads lock on the same object.
                DomSync syncObj = RobotsTxt.this.syncObjects.get(urlHostPort);
                if (syncObj == null) {
                    final DomSync fresh = new DomSync();
                    final DomSync existing = RobotsTxt.this.syncObjects.putIfAbsent(urlHostPort, fresh);
                    syncObj = existing == null ? fresh : existing;
                }
                synchronized (syncObj) {
                    // Another thread may have stored the entry while this one waited for the monitor.
                    if (robotsTable.containsKey((Object)robotsTable.encodedKey(urlHostPort))) {
                        return;
                    }
                    final DigestURL robotsURL = RobotsTxt.robotsURL(urlHostPort);
                    Response response = null;
                    if (robotsURL != null) {
                        if (log.isFine()) {
                            log.fine("Trying to download the robots.txt file from URL '" + String.valueOf(robotsURL) + "'.");
                        }
                        final Request request = new Request(robotsURL, null);
                        try {
                            response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, agent);
                        } catch (IOException e) {
                            log.info("Trying to download the robots.txt file from URL '" + robotsURL.toNormalform(false) + "' failed - " + e.getMessage());
                            response = null;
                        }
                    }
                    if (response == null) {
                        // NOTE(review): robotsURL may be null at this point - confirm RobotsTxtEntry tolerates a null URL.
                        RobotsTxt.this.processOldEntry(null, robotsURL, robotsTable);
                    } else {
                        RobotsTxt.this.processNewEntry(robotsURL, response, agent.robotIDs());
                    }
                }
            }
        };
        if (concurrent) {
            this.threadPool.execute(task);
        } else {
            task.run();
        }
    }

    /**
     * Reports how many pool threads are currently active.
     *
     * @return the pool's {@code getActiveCount()}, or 0 if there is no pool
     */
    public int getActiveThreads() {
        if (this.threadPool == null) {
            return 0;
        }
        return this.threadPool.getActiveCount();
    }

    /**
     * Stores an entry after a download attempt produced no response.
     * <p>
     * If no entry is given, a placeholder with empty allow and deny lists and
     * the current time as both dates is created; otherwise only the loaded
     * date of the given entry is refreshed. If the entry cannot be found in
     * the table after writing it, the whole table is cleared and the write is
     * repeated once.
     *
     * @param robotsTxt4Host existing entry to refresh, or {@code null} to create a placeholder
     * @param robotsURL      robots.txt URL used to construct the placeholder
     * @param robotsTable    heap used to verify that the write succeeded
     */
    private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURL robotsURL, BEncodedHeap robotsTable) {
        if (robotsTxt4Host == null) {
            robotsTxt4Host = new RobotsTxtEntry(robotsURL, new ArrayList<String>(), new ArrayList<String>(), new Date(), new Date(), null, null, 0, null);
        } else {
            robotsTxt4Host.setLoadedDate(new Date());
        }

        // Verify by key instead of comparing table sizes: the size does not grow when an
        // existing key is overwritten and may shrink concurrently, which would wrongly
        // trigger a reset of the whole table.
        final String storedHost = this.addEntry(robotsTxt4Host);
        if (storedHost != null && robotsTable.containsKey((Object)robotsTable.encodedKey(storedHost))) {
            return;
        }

        log.severe("new entry in robots.txt table failed, resetting database");
        try {
            this.clear();
        } catch (IOException e) {
            log.warn("cannot reset robots table", e);
        }
        this.addEntry(robotsTxt4Host);
    }

    /**
     * Builds an entry from a downloaded robots.txt response and stores it.
     * <p>
     * A 401 or 403 status is treated as "everything denied" (deny list "/")
     * and the content is not parsed. When the only agent ID is "Mozilla" the
     * deny list is emptied.
     *
     * @param robotsURL  URL the robots.txt file was loaded from
     * @param response   the download result; its header supplies status code, ETag and last-modified date
     * @param thisAgents robot IDs handed to the parser
     * @return the newly created entry, which has also been passed to {@code addEntry}
     */
    private RobotsTxtEntry processNewEntry(DigestURL robotsURL, Response response, String[] thisAgents) {
        final byte[] robotsTxt = response.getContent();
        final int statusCode = response.getResponseHeader().getStatusCode();

        final RobotsTxtParser parserResult;
        final ArrayList<String> denyPath;
        if (statusCode == 401 || statusCode == 403) {
            parserResult = new RobotsTxtParser(thisAgents);
            denyPath = new ArrayList<String>();
            denyPath.add("/");
        } else {
            parserResult = new RobotsTxtParser(thisAgents, robotsTxt);
            denyPath = parserResult.denyList();
        }

        final String etag = response.getResponseHeader().containsKey("ETag") ? ((String)response.getResponseHeader().get("ETag")).trim() : null;

        final boolean isBrowserAgent = thisAgents.length == 1 && thisAgents[0].equals("Mozilla");
        if (isBrowserAgent) {
            denyPath.clear();
        }

        final RobotsTxtEntry robotsTxt4Host = new RobotsTxtEntry(robotsURL, parserResult.allowList(), denyPath, new Date(), response.getResponseHeader().lastModified(), etag, parserResult.sitemap(), parserResult.crawlDelayMillis(), parserResult.agentName());
        this.addEntry(robotsTxt4Host);
        return robotsTxt4Host;
    }

    /**
     * Writes the given entry into the "robots" heap under its host name.
     *
     * @param entry2 the entry to store
     * @return the entry's host name on success, or {@code null} if any
     *         exception occurred (the exception is logged)
     */
    private String addEntry(RobotsTxtEntry entry2) {
        String storedHost;
        try {
            final BEncodedHeap heap = this.tables.getHeap("robots");
            heap.insert(heap.encodedKey(entry2.getHostName()), entry2.getMem());
            storedHost = entry2.getHostName();
        } catch (Exception e) {
            log.warn("cannot write robots.txt entry", e);
            storedHost = null;
        }
        return storedHost;
    }

    /**
     * Builds the "host:port" key for a URL.
     * <p>
     * When the URL carries no port (-1), 443 is used for the "https" protocol
     * and 80 for everything else. A host containing a colon is wrapped in
     * square brackets.
     *
     * @param theURL the URL to derive the key from
     * @return the key, or {@code null} if the URL has no host
     */
    public static final String getHostPort(MultiProtocolURL theURL) {
        int port = theURL.getPort();
        if (port == -1) {
            // "http" and every protocol other than "https" fall back to 80.
            port = theURL.getProtocol().equalsIgnoreCase("https") ? 443 : 80;
        }
        final String host = theURL.getHost();
        if (host == null) {
            return null;
        }
        final boolean needsBrackets = host.indexOf(':') >= 0;
        final String hostPart = needsBrackets ? "[" + host + "]" : host;
        return hostPart + ":" + port;
    }

    /**
     * Tells whether the given URL points at a robots.txt file.
     *
     * @param url the URL to test
     * @return {@code true} if the URL's path equals "/robots.txt"
     */
    public static boolean isRobotsURL(MultiProtocolURL url) {
        final String path = url.getPath();
        return path.equals(ROBOTS_TXT_PATH);
    }

    /**
     * Builds the robots.txt URL for a host:port key.
     * <p>
     * A trailing ":80" is dropped from the key. The scheme is "https://" if
     * the key ends with ":443" and "http://" otherwise.
     *
     * @param urlHostPort key of the form "host:port"
     * @return the robots.txt URL, or {@code null} if it is malformed (the error is logged)
     */
    public static DigestURL robotsURL(String urlHostPort) {
        final String hostPort = urlHostPort.endsWith(":80")
                ? urlHostPort.substring(0, urlHostPort.length() - 3)
                : urlHostPort;
        final String scheme = hostPort.endsWith(":443") ? "https://" : "http://";
        try {
            return new DigestURL(scheme + hostPort + ROBOTS_TXT_PATH);
        } catch (MalformedURLException e) {
            log.severe("Unable to generate robots.txt URL for host:port '" + hostPort + "'.", e);
            return null;
        }
    }

    /**
     * Checks a collection of root URLs in parallel on the internal thread
     * pool: each URL is checked against its robots.txt entry and, if allowed,
     * loaded.
     * <p>
     * Results are collected in submission order. If the calling thread is
     * interrupted while waiting, collection stops, the interrupt flag is
     * restored and the results gathered so far are returned. A task that fails
     * with an exception is logged and contributes no result.
     *
     * @param rootURLs  the URLs to check
     * @param userAgent agent used for robots.txt evaluation and for loading
     * @return one {@link CheckEntry} per successfully completed task
     */
    public Collection<CheckEntry> massCrawlCheck(Collection<DigestURL> rootURLs, ClientIdentification.Agent userAgent) {
        final ArrayList<Future<CheckEntry>> futures = new ArrayList<Future<CheckEntry>>(rootURLs.size());
        for (final DigestURL u : rootURLs) {
            futures.add(this.threadPool.submit(new CrawlCheckTask(u, userAgent)));
        }
        final ArrayList<CheckEntry> results = new ArrayList<CheckEntry>(futures.size());
        for (final Future<CheckEntry> future : futures) {
            try {
                results.add(future.get());
            } catch (InterruptedException e) {
                log.warn("massCrawlCheck was interrupted before retrieving all results.");
                // Preserve the interrupt status for callers further up the stack.
                Thread.currentThread().interrupt();
                break;
            } catch (ExecutionException e) {
                log.warn("massCrawlCheck task failed", e);
            }
        }
        return results;
    }

    /**
     * Stateless object whose instances are stored per host:port key and used
     * as the target of {@code synchronized} blocks.
     */
    private static final class DomSync {
        private DomSync() {
            // nothing to initialise: an instance carries no state
        }
    }

    /**
     * Task that checks a single URL: it fetches the robots.txt entry for the
     * URL's host and, unless the entry disallows the URL, loads the URL itself.
     */
    private class CrawlCheckTask implements Callable<CheckEntry> {
        private final DigestURL url;
        private final ClientIdentification.Agent userAgent;

        /**
         * @param url       the URL to check
         * @param userAgent agent used for robots.txt evaluation and for loading
         */
        public CrawlCheckTask(DigestURL url, ClientIdentification.Agent userAgent) {
            this.url = url;
            this.userAgent = userAgent;
        }

        /**
         * Runs the check.
         *
         * @return a result carrying the robots.txt entry (possibly {@code null})
         *         and either the loaded response, an error text if loading
         *         failed with an {@link IOException}, or neither if the URL is
         *         disallowed
         */
        @Override
        public CheckEntry call() throws Exception {
            final RobotsTxtEntry robotsEntry = RobotsTxt.this.getEntry(this.url, this.userAgent);
            // A missing entry means there is nothing that could forbid the URL.
            final boolean robotsAllowed = robotsEntry == null || !robotsEntry.isDisallowed(this.url);
            if (!robotsAllowed) {
                return new CheckEntry(this.url, robotsEntry, null, null);
            }
            try {
                final Request request = RobotsTxt.this.loader.request(this.url, true, false);
                final Response response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, Blacklist.BlacklistType.CRAWLER, this.userAgent);
                return new CheckEntry(this.url, robotsEntry, response, null);
            } catch (IOException e) {
                return new CheckEntry(this.url, robotsEntry, null, "error response: " + e.getMessage());
            }
        }
    }

    /**
     * Immutable result of checking one URL in massCrawlCheck.
     */
    public static class CheckEntry {
        /** The URL that was checked. */
        public final DigestURL digestURL;
        /** The robots.txt entry obtained for the URL's host; may be null. */
        public final RobotsTxtEntry robotsTxtEntry;
        /** The response from loading the URL; null if it was not loaded or loading failed. */
        public final Response response;
        /** Error text if loading failed; null otherwise. */
        public final String error;

        /**
         * Stores the four values unchanged in the corresponding fields.
         *
         * @param digestURL      the checked URL
         * @param robotsTxtEntry the robots.txt entry, may be null
         * @param response       the load response, may be null
         * @param error          the error text, may be null
         */
        public CheckEntry(DigestURL digestURL, RobotsTxtEntry robotsTxtEntry, Response response, String error) {
            this.digestURL = digestURL;
            this.robotsTxtEntry = robotsTxtEntry;
            this.response = response;
            this.error = error;
        }
    }
}

