/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.parser.html;

import java.awt.Dimension;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.storage.SizeLimitedMap;
import net.yacy.cora.storage.SizeLimitedSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ContentScraperListener;
import net.yacy.document.parser.html.EmbedEntry;
import net.yacy.document.parser.html.Evaluation;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.document.parser.html.IconLinkRelations;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.Scraper;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.ScraperListener;
import net.yacy.document.parser.html.TagValency;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;

public class ContentScraper
extends AbstractScraper
implements Scraper {
    // 0x100000 = 1 MiB. Not referenced in the visible part of this class;
    // presumably an upper bound for the size of a single tag - TODO confirm.
    private static final int MAX_TAGSIZE = 0x100000;
    // 0x2800000 = 40 MiB. The constructor passes the same value as first argument
    // when creating the content buffer.
    public static final int MAX_DOCSIZE = 0x2800000;
    // Character code 176 is the degree sign; scrapeText searches for it to detect coordinates.
    private final char degree = (char)176;
    // Numeric HTML entity of the apostrophe, searched by scrapeText as a marker after the minutes value.
    private final char[] minuteCharsHTML = "&#039;".toCharArray();
    // Tag name sets handed to the super constructor. They are filled outside the visible code.
    private static final Set<String> linkTags0 = new HashSet<String>(12, 0.99f);
    private static final Set<String> linkTags1 = new HashSet<String>(15, 0.99f);
    // Matches a single line feed; used to flatten inline script code to one line.
    private static final Pattern LB = Pattern.compile("\n");
    // Links collected from tags and from absolute URLs found in plain text.
    private final List<AnchorURL> anchors;
    // <link rel="alternate" type="application/rss+xml"> targets mapped to their title attribute.
    private final SizeLimitedMap<DigestURL, String> rss;
    // <link rel="stylesheet" type="text/css"> targets mapped to the rel value.
    private final SizeLimitedMap<DigestURL, String> css;
    // <embed>/<source> targets mapped to their embed description.
    private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds;
    // <img> entries in document order.
    private final List<ImageEntry> images;
    // Resolved src URLs of <script> tags.
    private final SizeLimitedSet<AnchorURL> script;
    // Resolved src URLs of <frame> tags.
    private final SizeLimitedSet<AnchorURL> frames;
    // Resolved src URLs of <iframe> tags.
    private final SizeLimitedSet<AnchorURL> iframes;
    // Not written in the visible code; presumably microdata item types - TODO confirm.
    private final SizeLimitedSet<DigestURL> linkedDataTypes;
    // <meta> values keyed by the lower-cased name, http-equiv or property attribute.
    private final SizeLimitedMap<String, String> metas;
    // <link rel="alternate" hreflang="..."> targets keyed by the hreflang value.
    private final SizeLimitedMap<String, DigestURL> hreflang;
    // Navigation links (rel = top, up, next, prev, first, last) keyed by the rel value.
    private final SizeLimitedMap<String, DigestURL> navigation;
    // Distinct <title> texts in order of appearance.
    private LinkedHashSet<String> titles;
    // Cleaned text of <article> elements.
    private final List<String> articles;
    // Dates parsed from itemprop="startDate" and from <time datetime="...">.
    private final List<Date> startDates;
    // Dates parsed from itemprop="endDate".
    private final List<Date> endDates;
    // Six lists; index 0 holds <h1> texts, index 5 holds <h6> texts.
    private List<String>[] headlines;
    // Occurrence counters for emphasised text fragments.
    private final ClusteredScoreMap<String> bold;
    private final ClusteredScoreMap<String> italic;
    private final ClusteredScoreMap<String> underline;
    // Cleaned text of <li>, <dt> and <dd> elements.
    private final List<String> li;
    private final List<String> dt;
    private final List<String> dd;
    // Accumulated plain text of the document; scrapeText appends to it.
    private final CharBuffer content;
    // Registered listeners; scrapeText picks the ContentScraperListener entries from it.
    private final EventListenerList htmlFilterEventListeners;
    // Geo coordinates in decimal degrees; 0.0 is treated as "not set" by scrapeText.
    private double lon;
    private double lat;
    // Target of <link rel="canonical">, or null.
    private AnchorURL canonical;
    // Target of <link rel="publisher">, or null.
    private AnchorURL publisher;
    // Maximum number of entries kept in anchors.
    private final int maxAnchors;
    // Receives the class attribute and content of every evaluated tag (see checkOpts).
    private final VocabularyScraper vocabularyScraper;
    // Only stored in the visible code; presumably names that switch the tag valency - TODO confirm.
    private final Set<String> valencySwitchTagNames;
    // Value returned by defaultValency().
    private final TagValency defaultValency;
    // Passed to the ISO 8601 parser when reading date attributes.
    private final int timezoneOffset;
    // Number of <div itemtype="http://data-vocabulary.org/Breadcrumb"> elements seen.
    private int breadcrumbs;
    // Icon links keyed by their URL; relations and sizes of duplicates are merged.
    private final Map<DigestURL, IconEntry> icons;
    // Base URL for resolving relative links; replaced when a <base href> is scraped.
    private DigestURL root;
    // Collects pattern evaluation scores for URLs, paths and text.
    private final Evaluation evaluationScores;
    // Initialised to false; not modified in the visible code.
    private boolean contentSizeLimitExceeded;
    // Set once more than maxAnchors links have been detected in text.
    private boolean maxAnchorsExceeded;
    // The following patterns are initialised outside the visible code.
    // protp and WHITESPACE_PATTERN are used by findAbsoluteURLs to locate URL start and end.
    private static final Pattern protp;
    private static final Pattern WHITESPACE_PATTERN;
    private static final Pattern commaSepPattern;
    private static final Pattern semicSepPattern;

    /**
     * Creates a scraper with an explicit limit on the number of collected anchors.
     *
     * @param root base URL of the document; must not be null
     * @param maxAnchors maximum number of entries kept in the anchors list
     * @param maxLinks capacity handed to every size-limited link collection
     * @param valencySwitchTagNames tag names stored for valency switching
     * @param defaultValency valency reported by {@link #defaultValency()}
     * @param vocabularyScraper receiver of class attribute checks
     * @param timezoneOffset offset handed to the date parser
     */
    public ContentScraper(DigestURL root, int maxAnchors, int maxLinks, Set<String> valencySwitchTagNames, TagValency defaultValency, VocabularyScraper vocabularyScraper, int timezoneOffset) {
        super(linkTags0, linkTags1);
        assert (root != null);

        // configuration
        this.root = root;
        this.maxAnchors = maxAnchors;
        this.valencySwitchTagNames = valencySwitchTagNames;
        this.defaultValency = defaultValency;
        this.vocabularyScraper = vocabularyScraper;
        this.timezoneOffset = timezoneOffset;

        // unbounded collections
        this.anchors = new ArrayList<>();
        this.images = new ArrayList<>();
        this.icons = new HashMap<>();
        this.titles = new LinkedHashSet<>();
        this.articles = new ArrayList<>();
        this.startDates = new ArrayList<>();
        this.endDates = new ArrayList<>();
        this.li = new ArrayList<>();
        this.dt = new ArrayList<>();
        this.dd = new ArrayList<>();

        // collections bounded by maxLinks
        this.rss = new SizeLimitedMap<>(maxLinks);
        this.css = new SizeLimitedMap<>(maxLinks);
        this.embeds = new SizeLimitedMap<>(maxLinks);
        this.metas = new SizeLimitedMap<>(maxLinks);
        this.hreflang = new SizeLimitedMap<>(maxLinks);
        this.navigation = new SizeLimitedMap<>(maxLinks);
        this.frames = new SizeLimitedSet<>(maxLinks);
        this.iframes = new SizeLimitedSet<>(maxLinks);
        this.script = new SizeLimitedSet<>(maxLinks);
        this.linkedDataTypes = new SizeLimitedSet<>(maxLinks);

        // one list per headline level h1..h6
        this.headlines = (List[])Array.newInstance(ArrayList.class, 6);
        for (int level = 0; level < this.headlines.length; level++) {
            this.headlines[level] = new ArrayList<String>();
        }

        // emphasis counters
        this.bold = new ClusteredScoreMap<>(false);
        this.italic = new ClusteredScoreMap<>(false);
        this.underline = new ClusteredScoreMap<>(false);

        // text buffer and listeners
        this.content = new CharBuffer(MAX_DOCSIZE, 1024);
        this.htmlFilterEventListeners = new EventListenerList();

        // scalar state
        this.lat = 0.0;
        this.lon = 0.0;
        this.canonical = null;
        this.publisher = null;
        this.breadcrumbs = 0;
        this.contentSizeLimitExceeded = false;
        this.maxAnchorsExceeded = false;

        // the document URL itself is the first evaluated element
        this.evaluationScores = new Evaluation();
        this.evaluationScores.match(Evaluation.Element.url, root.toNormalform(true));
    }

    /**
     * Creates a scraper without a practical limit on the number of anchors:
     * delegates to the full constructor with maxAnchors set to Integer.MAX_VALUE.
     *
     * @param root base URL of the document; must not be null
     * @param maxLinks capacity handed to every size-limited link collection
     * @param valencySwitchTagNames tag names stored for valency switching
     * @param defaultValency valency reported by defaultValency()
     * @param vocabularyScraper receiver of class attribute checks
     * @param timezoneOffset offset handed to the date parser
     */
    public ContentScraper(DigestURL root, int maxLinks, Set<String> valencySwitchTagNames, TagValency defaultValency, VocabularyScraper vocabularyScraper, int timezoneOffset) {
        this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
    }

    /**
     * Returns the tag valency that was supplied to the constructor.
     *
     * @return the default valency of this scraper
     */
    @Override
    public TagValency defaultValency() {
        return this.defaultValency;
    }

    /**
     * Finishes scraping by calling trimToSize() on the content buffer.
     * NOTE(review): presumably this releases unused buffer capacity - confirm against CharBuffer.
     */
    @Override
    public void finish() {
        this.content.trimToSize();
    }

    /**
     * Processes a piece of text found between tags.
     * <p>
     * The text is skipped when the enclosing tag is ignored or is a script/style tag.
     * Otherwise it is decoded, scanned for geo coordinates written as
     * {@code <hemisphere> <degrees>&deg; <minutes>'}, scanned for absolute URLs
     * (bounded by the anchor limit) and finally appended to the content buffer.
     * <p>
     * Coordinate detection is a best-effort heuristic: a candidate whose degree or
     * minute part is not a parsable number ends the coordinate scan instead of
     * aborting the whole text processing.
     *
     * @param newtext0 raw characters of the text node
     * @param insideTag the tag enclosing the text, or null when there is none
     */
    @Override
    public void scrapeText(char[] newtext0, Tag insideTag) {
        if (insideTag != null) {
            if (insideTag.tv == TagValency.IGNORE) {
                return;
            }
            if (TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name)) {
                return;
            }
        }
        int s = 0;
        char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
        this.evaluationScores.match(Evaluation.Element.text, newtext);
        location: while (s < newtext.length) {
            // p: position of the degree marker, pl: length of that marker
            int pl = 1;
            int p = CharBuffer.indexOf(newtext, s, this.degree);
            if (p < 0 && (p = CharBuffer.indexOf(newtext, s, "&deg;".toCharArray())) >= 0) {
                pl = 5;
            }
            if (p < 0) {
                break;
            }
            // q: position of the marker that terminates the minutes value
            int q = CharBuffer.indexOf(newtext, p + pl, this.minuteCharsHTML);
            if (q < 0) {
                q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
            }
            if (q < 0) {
                q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
            }
            if (q < 0) {
                q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
            }
            if (q < 0 && newtext.length - p == 7 + pl) {
                q = newtext.length;
            }
            if (q < 0) {
                break;
            }
            // walk backwards from the degree marker to the blank that follows the hemisphere letter
            int r = p;
            while (r-- > 1) {
                if (newtext[r] != ' ') {
                    continue;
                }
                final char hemisphere = newtext[--r];
                final boolean isLatitude = hemisphere == 'N' || hemisphere == 'S';
                final boolean isLongitude = hemisphere == 'E' || hemisphere == 'W';
                if (!isLatitude && !isLongitude) {
                    break location;
                }
                final double degrees;
                final double minutes;
                try {
                    degrees = Double.parseDouble(new String(newtext, r + 2, p - r - 2));
                    // the minutes always start one character behind the degree marker,
                    // whatever the length of that marker is
                    minutes = Double.parseDouble(new String(newtext, p + pl + 1, q - p - pl - 1));
                } catch (NumberFormatException | IndexOutOfBoundsException ignored) {
                    // the text only looked like a coordinate; give up the coordinate scan
                    break location;
                }
                final boolean negative = hemisphere == 'S' || hemisphere == 'W';
                final double value = negative ? -degrees - minutes / 60.0 : degrees + minutes / 60.0;
                if (isLatitude) {
                    this.lat = value;
                    if (this.lon != 0.0) {
                        break location;
                    }
                } else {
                    this.lon = value;
                    if (this.lat != 0.0) {
                        break location;
                    }
                }
                // only one half of the coordinate is known so far: continue behind this match
                s = q + 6;
                continue location;
            }
            break location;
        }
        String b = ContentScraper.cleanLine(ContentScraper.stripAllTags(newtext));
        // terminate text of non-anchor tags with a full stop when it has no punctuation at its end
        if (insideTag != null && !insideTag.name.equals(TagName.a.name()) && b.length() != 0 && !SentenceReader.punctuation(b.charAt(b.length() - 1))) {
            b = b + '.';
        }
        // the listener list stores (class, listener) pairs
        Object[] listeners = this.htmlFilterEventListeners.getListenerList();
        ArrayList<ContentScraperListener> anchorListeners = new ArrayList<ContentScraperListener>();
        for (int i = 0; i < listeners.length; i += 2) {
            if (listeners[i] != ContentScraperListener.class) {
                continue;
            }
            anchorListeners.add((ContentScraperListener)listeners[i + 1]);
        }
        if (!this.maxAnchorsExceeded) {
            int maxLinksToDetect = this.maxAnchors - this.anchors.size();
            if (maxLinksToDetect < Integer.MAX_VALUE) {
                // detect one link more than allowed to find out whether the limit is exceeded
                ++maxLinksToDetect;
            }
            ContentScraper.findAbsoluteURLs(b, this.anchors, anchorListeners, maxLinksToDetect);
            if (this.anchors.size() > this.maxAnchors) {
                this.maxAnchorsExceeded = true;
                this.anchors.remove(this.anchors.size() - 1);
            }
        }
        if (!b.isEmpty()) {
            this.content.append(b);
            this.content.appendSpace();
        }
    }

    /**
     * Searches a text for absolute URLs and reports each well-formed one.
     * <p>
     * A URL candidate starts at a scheme match and extends to the next white space
     * (or the end of the text). A trailing dot and everything from an unpaired
     * round, curly or square bracket on are cut off. Candidates that are not valid
     * URLs are skipped silently.
     *
     * @param text the text to search; may be null
     * @param urls2 collection receiving the detected URLs; may be null
     * @param listeners listeners notified with the normal form of each detected URL; may be null
     * @param maxURLs maximum number of URLs to detect
     * @return the number of detected URLs
     */
    public static long findAbsoluteURLs(String text, Collection<AnchorURL> urls2, Collection<ContentScraperListener> listeners, long maxURLs) {
        if (text == null) {
            return 0L;
        }
        final int textLength = text.length();
        final Matcher schemeFinder = protp.matcher(text);
        final Matcher blankFinder = WHITESPACE_PATTERN.matcher(text);
        long found = 0L;
        int searchFrom = 0;
        while (searchFrom < textLength && found < maxURLs && schemeFinder.find(searchFrom)) {
            final int start = schemeFinder.start();
            final int end;
            if (blankFinder.find(schemeFinder.end())) {
                end = blankFinder.start();
            } else {
                end = textLength;
            }
            String candidate = text.substring(start, end);
            if (candidate.endsWith(".")) {
                candidate = candidate.substring(0, candidate.length() - 1);
            }
            candidate = ContentScraper.removeUnpairedBrackets(candidate, '(', ')');
            candidate = ContentScraper.removeUnpairedBrackets(candidate, '{', '}');
            candidate = ContentScraper.removeUnpairedBrackets(candidate, '[', ']');
            // the next search continues right behind this candidate
            searchFrom = start + candidate.length();
            try {
                final AnchorURL detected = new AnchorURL(candidate);
                found++;
                if (urls2 != null) {
                    urls2.add(detected);
                }
                if (listeners != null) {
                    for (final ContentScraperListener receiver : listeners) {
                        receiver.anchorAdded(detected.toNormalform(false));
                    }
                }
            } catch (MalformedURLException ignored) {
                // not a valid URL: neither counted nor reported
            }
        }
        return found;
    }

    /**
     * Searches a text for absolute URLs without a limit on their number:
     * delegates to the four-argument variant with maxURLs set to Long.MAX_VALUE
     * and discards the returned count.
     *
     * @param text the text to search; may be null
     * @param urls2 collection receiving the detected URLs; may be null
     * @param listeners listeners notified about each detected URL; may be null
     */
    public static void findAbsoluteURLs(String text, Collection<AnchorURL> urls2, Collection<ContentScraperListener> listeners) {
        ContentScraper.findAbsoluteURLs(text, urls2, listeners, Long.MAX_VALUE);
    }

    /**
     * Cuts a string at its first unbalanced bracket.
     * <p>
     * When a closing mark appears without a matching opening mark, the result ends
     * right before that closing mark. When opening marks remain unclosed at the end,
     * the result ends right before the outermost of them. A balanced string is
     * returned unchanged.
     *
     * @param str the string to check; may be null
     * @param openingMark the opening bracket character
     * @param closingMark the closing bracket character
     * @return the string without the unbalanced tail, or null when str is null
     */
    protected static String removeUnpairedBrackets(String str, char openingMark, char closingMark) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        int openCount = 0;
        int outermostOpenAt = -1;
        for (int pos = 0; pos < length; pos++) {
            final char current = str.charAt(pos);
            if (current == openingMark) {
                if (openCount == 0) {
                    outermostOpenAt = pos;
                }
                openCount++;
            } else if (current == closingMark) {
                if (openCount == 0) {
                    // closing mark without partner: everything from here on is dropped
                    return str.substring(0, pos);
                }
                openCount--;
                if (openCount == 0) {
                    outermostOpenAt = -1;
                }
            }
        }
        if (openCount > 0 && outermostOpenAt >= 0) {
            // unclosed opening mark: drop it together with everything behind it
            return str.substring(0, outermostOpenAt);
        }
        return str;
    }

    /**
     * Resolves a path against the current root URL.
     *
     * @param relativePath the path to resolve
     * @return the resulting anchor URL, or null when resolving throws any exception
     */
    private AnchorURL absolutePath(String relativePath) {
        try {
            return AnchorURL.newAnchor(this.root, relativePath);
        }
        catch (Exception e) {
            // callers treat null as "no usable URL"
            return null;
        }
    }

    /**
     * Extracts the microdata item types of a tag.
     * <p>
     * Types are only read when the tag carries an itemscope attribute. The
     * itemtype attribute is split at white space; tokens that are not valid URLs
     * are dropped.
     *
     * @param tagAttributes the attributes of the tag; may be null
     * @return the item type URLs, possibly empty but never null
     */
    private Set<DigestURL> parseMicrodataItemType(Properties tagAttributes) {
        final HashSet<DigestURL> result = new HashSet<DigestURL>();
        if (tagAttributes == null || tagAttributes.getProperty("itemscope") == null) {
            return result;
        }
        for (final String candidate : ContentScraper.parseSpaceSeparatedTokens(tagAttributes.getProperty("itemtype"))) {
            try {
                result.add(new DigestURL(candidate));
            } catch (MalformedURLException ignored) {
                // token is not a URL: skip it
            }
        }
        return result;
    }

    /**
     * Evaluates attributes that are relevant for every tag.
     * <p>
     * The class attribute is handed to the vocabulary scraper. When the tag has an
     * itemprop attribute with a value in its content attribute (or, failing that,
     * its datetime attribute), latitude, longitude, startDate and endDate are read
     * from that value. Values that cannot be parsed are ignored, because they come
     * from arbitrary documents and must not abort the scraping of the tag.
     *
     * @param tag the tag to evaluate
     */
    private void checkOpts(Tag tag) {
        String classprop = tag.opts.getProperty("class", EMPTY_STRING);
        this.vocabularyScraper.check(this.root, classprop, tag.content);
        String itemprop = tag.opts.getProperty("itemprop");
        if (itemprop == null) {
            return;
        }
        String propval = tag.opts.getProperty("content");
        if (propval == null) {
            propval = tag.opts.getProperty("datetime");
        }
        if (propval == null) {
            return;
        }
        switch (itemprop) {
            case "latitude": {
                try {
                    this.lat = Double.parseDouble(propval);
                }
                catch (NumberFormatException ignored) {
                    // malformed coordinate: keep the previous latitude
                }
                break;
            }
            case "longitude": {
                try {
                    this.lon = Double.parseDouble(propval);
                }
                catch (NumberFormatException ignored) {
                    // malformed coordinate: keep the previous longitude
                }
                break;
            }
            case "startDate": {
                try {
                    Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
                    this.startDates.add(startDate);
                }
                catch (ParseException ignored) {
                    // malformed date: nothing is recorded
                }
                break;
            }
            case "endDate": {
                try {
                    Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
                    this.endDates.add(endDate);
                }
                catch (ParseException ignored) {
                    // malformed date: nothing is recorded
                }
                break;
            }
            default: {
                break;
            }
        }
    }

    /**
     * Parses the value of an icon sizes attribute.
     * <p>
     * The value is split at white space; every token that fully matches
     * {@code IconEntry.SIZE_PATTERN} contributes one dimension built from the
     * first (width) and second (height) capturing group. Other tokens are ignored.
     *
     * @param sizesAttr the attribute value; may be null
     * @return the parsed dimensions, possibly empty but never null
     */
    public static Set<Dimension> parseSizes(String sizesAttr) {
        final HashSet<Dimension> result = new HashSet<Dimension>();
        for (final String candidate : ContentScraper.parseSpaceSeparatedTokens(sizesAttr)) {
            if (candidate == null) {
                continue;
            }
            final Matcher sizeMatcher = IconEntry.SIZE_PATTERN.matcher(candidate);
            if (sizeMatcher.matches()) {
                final int width = Integer.parseInt(sizeMatcher.group(1));
                final int height = Integer.parseInt(sizeMatcher.group(2));
                result.add(new Dimension(width, height));
            }
        }
        return result;
    }

    /**
     * Splits an attribute value into its distinct tokens.
     * <p>
     * The value is trimmed and then split with the regular expression of
     * {@code CommonPattern.SPACES}.
     *
     * @param attr the attribute value; may be null
     * @return the set of tokens; empty when attr is null or blank
     */
    public static Set<String> parseSpaceSeparatedTokens(String attr) {
        final HashSet<String> result = new HashSet<String>();
        if (attr == null) {
            return result;
        }
        final String trimmed = attr.trim();
        if (trimmed.isEmpty()) {
            return result;
        }
        for (final String item : trimmed.split(CommonPattern.SPACES.pattern())) {
            result.add(item);
        }
        return result;
    }

    /**
     * Selects the icon relations among the tokens of a rel attribute.
     *
     * @param relTokens the tokens of the rel attribute; must not be null
     * @return the tokens accepted by {@code IconLinkRelations.isIconRel},
     *         converted to lower case with the English locale
     */
    public Set<String> retainIconRelations(Collection<String> relTokens) {
        final HashSet<String> result = new HashSet<String>();
        for (final String candidate : relTokens) {
            if (IconLinkRelations.isIconRel(candidate)) {
                result.add(candidate.toLowerCase(Locale.ENGLISH));
            }
        }
        return result;
    }

    /**
     * Processes a tag that is evaluated by its attributes only.
     * <p>
     * Ignored tags are skipped. For every other tag the common attributes are
     * checked, then the tag is handled according to its name: img, base, frame,
     * body, meta, area, link, embed/source, param, iframe and html. Finally the
     * registered listeners are notified through fireScrapeTag0.
     * <p>
     * Meta keys are lower-cased independently of the default locale, and a lang
     * attribute of the html tag shorter than two characters is ignored because
     * no two-letter language code can be taken from it.
     *
     * @param tag the tag to process
     */
    @Override
    public void scrapeTag0(Tag tag) {
        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        this.checkOpts(tag);
        if (tag.name.equalsIgnoreCase("img")) {
            String src = tag.opts.getProperty("src", EMPTY_STRING);
            try {
                AnchorURL url;
                if (src.length() > 0 && (url = this.absolutePath(src)) != null) {
                    int width = NumberTools.parseIntDecSubstring(tag.opts.getProperty("width", "-1"));
                    int height = NumberTools.parseIntDecSubstring(tag.opts.getProperty("height", "-1"));
                    ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1L);
                    this.images.add(ie);
                }
            }
            catch (NumberFormatException ignored) {
                // unparsable width or height: the image is not recorded
            }
            this.evaluationScores.match(Evaluation.Element.imgpath, src);
        } else if (tag.name.equalsIgnoreCase("base")) {
            String baseHref = tag.opts.getProperty("href", EMPTY_STRING);
            if (!baseHref.isEmpty()) {
                try {
                    // all following relative links are resolved against the new base
                    this.root = AnchorURL.newAnchor(this.root, baseHref);
                }
                catch (RuntimeException | MalformedURLException ignored) {
                    // unusable base href: keep the current root
                }
            }
        } else if (tag.name.equalsIgnoreCase("frame")) {
            AnchorURL src = this.absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
            if (src != null) {
                tag.opts.put("src", src.toNormalform(true));
                src.setAll(tag.opts);
                this.frames.add(src);
                this.evaluationScores.match(Evaluation.Element.framepath, src.toNormalform(true));
            }
        } else if (tag.name.equalsIgnoreCase("body")) {
            String classprop = tag.opts.getProperty("class", EMPTY_STRING);
            this.evaluationScores.match(Evaluation.Element.bodyclass, classprop);
        } else if (tag.name.equalsIgnoreCase("meta")) {
            String content = tag.opts.getProperty("content", EMPTY_STRING);
            String name = tag.opts.getProperty("name", EMPTY_STRING);
            if (name.length() > 0) {
                // Locale.ROOT keeps keys such as "title" stable under e.g. a Turkish default locale
                String key = name.toLowerCase(Locale.ROOT);
                this.metas.put(key, content);
                if ("generator".equals(key)) {
                    this.evaluationScores.match(Evaluation.Element.metagenerator, content);
                }
            }
            if ((name = tag.opts.getProperty("http-equiv", EMPTY_STRING)).length() > 0) {
                this.metas.put(name.toLowerCase(Locale.ROOT), content);
            }
            if ((name = tag.opts.getProperty("property", EMPTY_STRING)).length() > 0) {
                this.metas.put(name.toLowerCase(Locale.ROOT), content);
            }
        } else if (tag.name.equalsIgnoreCase("area")) {
            String areatitle = ContentScraper.cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
            String href = tag.opts.getProperty("href", EMPTY_STRING);
            if (href.length() > 0) {
                tag.opts.put("name", areatitle);
                AnchorURL url = this.absolutePath(href);
                if (url != null) {
                    tag.opts.put("href", url.toNormalform(true));
                    url.setAll(tag.opts);
                    this.addAnchor(url);
                }
            }
        } else if (tag.name.equalsIgnoreCase("link")) {
            String href = tag.opts.getProperty("href", EMPTY_STRING);
            AnchorURL newLink = this.absolutePath(href);
            if (newLink != null) {
                tag.opts.put("href", newLink.toNormalform(true));
                String rel = tag.opts.getProperty("rel", EMPTY_STRING);
                Set<String> relTokens = ContentScraper.parseSpaceSeparatedTokens(rel);
                String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
                String type = tag.opts.getProperty("type", EMPTY_STRING);
                String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
                Set<String> iconRels = this.retainIconRelations(relTokens);
                if (!iconRels.isEmpty()) {
                    String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING);
                    Set<Dimension> sizes = ContentScraper.parseSizes(sizesAttr);
                    IconEntry icon = this.icons.get(newLink);
                    if (icon != null) {
                        // same icon URL seen before: merge relations and sizes
                        icon.getRel().addAll(iconRels);
                        icon.getSizes().addAll(sizes);
                    } else {
                        icon = new IconEntry(newLink, iconRels, sizes);
                        this.icons.put(newLink, icon);
                    }
                } else if (rel.equalsIgnoreCase("canonical")) {
                    // the canonical link is named after the first title seen so far
                    tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
                    newLink.setAll(tag.opts);
                    this.addAnchor(newLink);
                    this.canonical = newLink;
                } else if (rel.equalsIgnoreCase("publisher")) {
                    this.publisher = newLink;
                } else if (rel.equalsIgnoreCase("top") || rel.equalsIgnoreCase("up") || rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev") || rel.equalsIgnoreCase("first") || rel.equalsIgnoreCase("last")) {
                    this.navigation.put(rel, newLink);
                } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
                    this.rss.put(newLink, linktitle);
                } else if (rel.equalsIgnoreCase("alternate") && hreflang.length() > 0) {
                    this.hreflang.put(hreflang, newLink);
                } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
                    this.css.put(newLink, rel);
                    this.evaluationScores.match(Evaluation.Element.csspath, href);
                } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
                    // any other link that is not a stylesheet is treated as an ordinary anchor
                    tag.opts.put("name", linktitle);
                    newLink.setAll(tag.opts);
                    this.addAnchor(newLink);
                }
            }
        } else if (tag.name.equalsIgnoreCase("embed") || tag.name.equalsIgnoreCase("source")) {
            String src = tag.opts.getProperty("src", EMPTY_STRING);
            try {
                AnchorURL url;
                if (src.length() > 0 && (url = this.absolutePath(src)) != null) {
                    int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
                    int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
                    tag.opts.put("src", url.toNormalform(true));
                    EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
                    this.embeds.put(url, ie);
                    url.setAll(tag.opts);
                }
            }
            catch (NumberFormatException ignored) {
                // unparsable width or height: the embed is not recorded
            }
        } else if (tag.name.equalsIgnoreCase("param")) {
            AnchorURL url;
            String name = tag.opts.getProperty("name", EMPTY_STRING);
            if (name.equalsIgnoreCase("movie") && (url = this.absolutePath(tag.opts.getProperty("value", EMPTY_STRING))) != null) {
                tag.opts.put("value", url.toNormalform(true));
                url.setAll(tag.opts);
                this.addAnchor(url);
            }
        } else if (tag.name.equalsIgnoreCase("iframe")) {
            AnchorURL src = this.absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
            if (src != null) {
                tag.opts.put("src", src.toNormalform(true));
                src.setAll(tag.opts);
                this.iframes.add(src);
                this.evaluationScores.match(Evaluation.Element.iframepath, src.toNormalform(true));
            }
        } else if (tag.name.equalsIgnoreCase("html")) {
            String lang = tag.opts.getProperty("lang", EMPTY_STRING);
            // keep only the two-letter language part of values like "hu-hu";
            // shorter values cannot hold such a code and would break substring
            if (lang.length() >= 2) {
                this.metas.put("dc.language", lang.substring(0, 2));
            }
        }
        this.fireScrapeTag0(tag.name, tag.opts);
    }

    @Override
    public void scrapeTag1(Tag tag) {
        String h;
        if (tag.tv == TagValency.IGNORE) {
            return;
        }
        this.checkOpts(tag);
        if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
            AnchorURL url;
            String href = tag.opts.getProperty("href", EMPTY_STRING);
            if (href.length() > 0 && (url = this.absolutePath(href)) != null) {
                if (this.followDenied()) {
                    String rel = tag.opts.getProperty("rel", EMPTY_STRING);
                    if (rel.length() == 0) {
                        rel = "nofollow";
                    } else if (rel.indexOf("nofollow") < 0) {
                        rel = rel + ",nofollow";
                    }
                    tag.opts.put("rel", rel);
                }
                tag.opts.put("text", ContentScraper.stripAllTags(tag.content.getChars()));
                tag.opts.put("href", url.toNormalform(true));
                url.setAll(tag.opts);
                this.addAnchor(url);
            }
            this.evaluationScores.match(Evaluation.Element.apath, href);
        }
        if (tag.name.equalsIgnoreCase("div")) {
            String id = tag.opts.getProperty("id", EMPTY_STRING);
            this.evaluationScores.match(Evaluation.Element.divid, id);
            String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
            if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
                ++this.breadcrumbs;
            }
        } else if (tag.name.equalsIgnoreCase("h1") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[0].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("h2") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[1].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("h3") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[2].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("h4") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[3].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("h5") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[4].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("h6") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.headlines[5].add(h);
            }
        } else if (tag.name.equalsIgnoreCase("title") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            this.titles.add(h);
            this.evaluationScores.match(Evaluation.Element.title, h);
        } else if (tag.name.equalsIgnoreCase("b") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.bold.inc(h);
            }
        } else if (tag.name.equalsIgnoreCase("strong") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.bold.inc(h);
            }
        } else if (tag.name.equalsIgnoreCase("em") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.bold.inc(h);
            }
        } else if (tag.name.equalsIgnoreCase("i") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.italic.inc(h);
            }
        } else if (tag.name.equalsIgnoreCase("u") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.underline.inc(h);
            }
        } else if (tag.name.equalsIgnoreCase("li") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.li.add(h);
            }
        } else if (tag.name.equalsIgnoreCase("dt") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.dt.add(h);
            }
        } else if (tag.name.equalsIgnoreCase("dd") && tag.content.length() < 1024) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.dd.add(h);
            }
        } else if (tag.name.equalsIgnoreCase("script")) {
            String src = tag.opts.getProperty("src", EMPTY_STRING);
            if (src.length() > 0) {
                AnchorURL absoluteSrc = this.absolutePath(src);
                if (absoluteSrc != null) {
                    this.script.add(absoluteSrc);
                }
                this.evaluationScores.match(Evaluation.Element.scriptpath, src);
            } else {
                this.evaluationScores.match(Evaluation.Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
            }
        } else if (tag.name.equalsIgnoreCase("article")) {
            h = ContentScraper.cleanLine(CharacterCoding.html2unicode(ContentScraper.stripAllTags(tag.content.getChars())));
            if (h.length() > 0) {
                this.articles.add(h);
            }
        } else if (tag.name.equalsIgnoreCase(TagName.time.name()) && (h = tag.opts.getProperty("datetime")) != null) {
            try {
                Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime();
                this.startDates.add(startDate);
            }
            catch (ParseException parseException) {
                // empty catch block
            }
        }
        this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
    }

    /**
     * Invoked for every opening tag. For evaluated tags that carry attributes, the
     * result of {@code parseMicrodataItemType} on those attributes is added to
     * {@code linkedDataTypes}.
     *
     * @param tag the opened tag; nothing happens when it is null, is not in
     *            {@code TagValency.EVAL} valency, or has no attributes
     */
    @Override
    public void scrapeAnyTagOpening(Tag tag) {
        if (tag == null || tag.tv != TagValency.EVAL || tag.opts == null) {
            return;
        }
        this.linkedDataTypes.addAll(this.parseMicrodataItemType(tag.opts));
    }

    /**
     * Decides the valency of a tag.
     * <ul>
     * <li>A non-default valency of the parent tag is inherited.</li>
     * <li>Otherwise a {@code div} or {@code nav} tag whose {@code class} attribute
     * shares at least one token with {@code valencySwitchTagNames} gets the reversed
     * default valency.</li>
     * <li>In all other cases the default valency applies.</li>
     * </ul>
     *
     * @param tag       the tag to evaluate; may be null
     * @param parentTag the enclosing tag; may be null
     * @return the valency to apply to {@code tag}
     */
    @Override
    public TagValency tagValency(Tag tag, Tag parentTag) {
        if (parentTag != null && parentTag.tv != this.defaultValency) {
            return parentTag.tv;
        }
        // tag.opts can be null (Tag accepts caller-supplied options), so guard it
        // the same way scrapeAnyTagOpening does instead of risking an NPE.
        if (this.valencySwitchTagNames == null || tag == null || tag.opts == null) {
            return this.defaultValency;
        }
        boolean switchableTag = TagName.div.name().equals(tag.name) || TagName.nav.name().equals(tag.name);
        if (!switchableTag) {
            return this.defaultValency;
        }
        String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
        Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
        if (!Collections.disjoint(this.valencySwitchTagNames, classes)) {
            return this.defaultValency.reverse();
        }
        return this.defaultValency;
    }

    /**
     * Records an anchor and notifies listeners with its normal form, unless the
     * anchor limit has already been reached; in that case only the
     * {@code maxAnchorsExceeded} flag is raised.
     *
     * @param anchor the anchor URL to record
     */
    protected void addAnchor(AnchorURL anchor) {
        if (this.anchors.size() < this.maxAnchors) {
            this.anchors.add(anchor);
            this.fireAddAnchor(anchor.toNormalform(false));
            return;
        }
        this.maxAnchorsExceeded = true;
    }

    /**
     * Feeds an HTML comment to the evaluation scores after replacing every match
     * of the {@code LB} pattern with a single space.
     *
     * @param comment the raw comment characters
     */
    @Override
    public void scrapeComment(char[] comment) {
        String flattened = LB.matcher(new String(comment)).replaceAll(" ");
        this.evaluationScores.match(Evaluation.Element.comment, flattened);
    }

    /**
     * Returns the document titles as a new list.
     * <p>
     * Side effect: a non-empty {@code title} meta value is added to the internal
     * title collection first; if that collection is still empty afterwards, the
     * first entry of the first non-empty headline level is added as a fallback.
     *
     * @return a fresh list holding a copy of the collected titles
     */
    public List<String> getTitles() {
        String metaTitle = (String)this.metas.get("title");
        if (metaTitle != null && !metaTitle.isEmpty()) {
            this.titles.add(metaTitle);
        }
        if (this.titles.isEmpty()) {
            // no explicit title at all: borrow the highest-ranked headline
            for (int level = 0; level < this.headlines.length; level++) {
                if (!this.headlines[level].isEmpty()) {
                    this.titles.add(this.headlines[level].get(0));
                    break;
                }
            }
        }
        return new ArrayList<String>(this.titles);
    }

    /**
     * Returns the headlines collected for one headline level.
     *
     * @param i the 1-based headline level (1 for the first entry of the internal array)
     * @return a new array with the headlines of that level
     */
    public String[] getHeadlines(int i) {
        assert (i >= 1 && i <= this.headlines.length);
        final int index = i - 1;
        final int count = this.headlines[index].size();
        return this.headlines[index].toArray(new String[count]);
    }

    /**
     * Returns the keys of the {@code bold} score map, in the order delivered by
     * {@code keys(false)}.
     *
     * @return a new array of the recorded texts
     */
    public String[] getBold() {
        List<String> texts = new ArrayList<String>();
        for (Iterator<String> it = this.bold.keys(false); it.hasNext();) {
            texts.add(it.next());
        }
        return texts.toArray(new String[texts.size()]);
    }

    /**
     * Looks up the {@code bold} score of each given text.
     *
     * @param a the texts to look up; must not be null
     * @return an array parallel to {@code a} holding the score of each text
     */
    public Integer[] getBoldCount(String[] a) {
        final Integer[] counts = new Integer[a.length];
        int slot = 0;
        for (String text : a) {
            counts[slot++] = this.bold.get(text);
        }
        return counts;
    }

    /**
     * Returns the keys of the {@code italic} score map, in the order delivered by
     * {@code keys(false)}.
     *
     * @return a new array of the recorded texts
     */
    public String[] getItalic() {
        List<String> texts = new ArrayList<String>();
        for (Iterator<String> it = this.italic.keys(false); it.hasNext();) {
            texts.add(it.next());
        }
        return texts.toArray(new String[texts.size()]);
    }

    /**
     * Looks up the {@code italic} score of each given text.
     *
     * @param a the texts to look up; must not be null
     * @return an array parallel to {@code a} holding the score of each text
     */
    public Integer[] getItalicCount(String[] a) {
        final Integer[] counts = new Integer[a.length];
        int slot = 0;
        for (String text : a) {
            counts[slot++] = this.italic.get(text);
        }
        return counts;
    }

    /**
     * Returns the keys of the {@code underline} score map, in the order delivered
     * by {@code keys(false)}.
     *
     * @return a new array of the recorded texts
     */
    public String[] getUnderline() {
        List<String> texts = new ArrayList<String>();
        for (Iterator<String> it = this.underline.keys(false); it.hasNext();) {
            texts.add(it.next());
        }
        return texts.toArray(new String[texts.size()]);
    }

    /**
     * Looks up the {@code underline} score of each given text.
     *
     * @param a the texts to look up; must not be null
     * @return an array parallel to {@code a} holding the score of each text
     */
    public Integer[] getUnderlineCount(String[] a) {
        final Integer[] counts = new Integer[a.length];
        int slot = 0;
        for (String text : a) {
            counts[slot++] = this.underline.get(text);
        }
        return counts;
    }

    /**
     * @return a new array with the texts collected in {@code li}
     */
    public String[] getLi() {
        final int count = this.li.size();
        return this.li.toArray(new String[count]);
    }

    /**
     * @return a new array with the texts collected in {@code dt}
     */
    public String[] getDt() {
        final int count = this.dt.size();
        return this.dt.toArray(new String[count]);
    }

    /**
     * @return a new array with the texts collected in {@code dd}
     */
    public String[] getDd() {
        final int count = this.dd.size();
        return this.dd.toArray(new String[count]);
    }

    /**
     * @return the internal {@code startDates} list itself (not a copy)
     */
    public List<Date> getStartDates() {
        return this.startDates;
    }

    /**
     * @return the internal {@code endDates} list itself (not a copy)
     */
    public List<Date> getEndDates() {
        return this.endDates;
    }

    /**
     * Collects all anchors whose file name extension is exactly {@code swf}.
     *
     * @return a new array of the matching anchors, empty when there are none
     */
    public DigestURL[] getFlash() {
        List<DigestURL> flashLinks = new ArrayList<DigestURL>();
        for (DigestURL link : this.anchors) {
            if ("swf".equals(MultiProtocolURL.getFileExtension(link.getFileName()))) {
                flashLinks.add(link);
            }
        }
        return flashLinks.toArray(new DigestURL[flashLinks.size()]);
    }

    /**
     * Tells whether at least one anchor has the file name extension {@code swf}.
     *
     * @return true as soon as a matching anchor is found, false otherwise
     */
    public boolean containsFlash() {
        for (MultiProtocolURL link : this.anchors) {
            if ("swf".equals(MultiProtocolURL.getFileExtension(link.getFileName()))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the current value of the {@code breadcrumbs} counter
     */
    public int breadcrumbCount() {
        return this.breadcrumbs;
    }

    /**
     * Returns the trimmed content buffer as a string.
     *
     * @return the content text, or an empty string when building it runs out of
     *         memory (the error is logged)
     */
    public String getText() {
        String text;
        try {
            text = this.content.trim().toString();
        }
        catch (OutOfMemoryError e) {
            ConcurrentLog.logException(e);
            text = "";
        }
        return text;
    }

    /**
     * @return the internal {@code articles} list itself (not a copy)
     */
    public List<String> getArticles() {
        return this.articles;
    }

    /**
     * @return the internal {@code anchors} list itself (not a copy)
     */
    public List<AnchorURL> getAnchors() {
        return this.anchors;
    }

    /**
     * @return the internal {@code rss} map itself (not a copy)
     */
    public LinkedHashMap<DigestURL, String> getRSS() {
        return this.rss;
    }

    /**
     * @return the internal {@code css} map itself (not a copy)
     */
    public Map<DigestURL, String> getCSS() {
        return this.css;
    }

    /**
     * @return the internal {@code frames} set itself (not a copy)
     */
    public Set<AnchorURL> getFrames() {
        return this.frames;
    }

    /**
     * @return the internal {@code iframes} set itself (not a copy)
     */
    public Set<AnchorURL> getIFrames() {
        return this.iframes;
    }

    /**
     * @return the internal {@code linkedDataTypes} set itself (not a copy); it is
     *         filled by {@code scrapeAnyTagOpening}
     */
    public SizeLimitedSet<DigestURL> getLinkedDataTypes() {
        return this.linkedDataTypes;
    }

    /**
     * @return the internal {@code script} set itself (not a copy)
     */
    public Set<AnchorURL> getScript() {
        return this.script;
    }

    /**
     * @return the current value of the {@code canonical} field
     */
    public AnchorURL getCanonical() {
        return this.canonical;
    }

    /**
     * @return the current value of the {@code publisher} field
     */
    public DigestURL getPublisherLink() {
        return this.publisher;
    }

    /**
     * @return the internal {@code hreflang} map itself (not a copy)
     */
    public Map<String, DigestURL> getHreflang() {
        return this.hreflang;
    }

    /**
     * @return the internal {@code navigation} map itself (not a copy)
     */
    public Map<String, DigestURL> getNavigation() {
        return this.navigation;
    }

    /**
     * @return the internal {@code images} list itself (not a copy)
     */
    public List<ImageEntry> getImages() {
        return this.images;
    }

    /**
     * @return the internal {@code embeds} map itself (not a copy)
     */
    public Map<AnchorURL, EmbedEntry> getEmbeds() {
        return this.embeds;
    }

    /**
     * @return the internal {@code metas} map itself (not a copy)
     */
    public Map<String, String> getMetas() {
        return this.metas;
    }

    /**
     * @return the internal {@code icons} map itself (not a copy)
     */
    public Map<DigestURL, IconEntry> getIcons() {
        return this.icons;
    }

    /**
     * @return the current value of the {@code contentSizeLimitExceeded} flag
     */
    public boolean isContentSizeLimitExceeded() {
        return this.contentSizeLimitExceeded;
    }

    /**
     * Sets the {@code contentSizeLimitExceeded} flag, which is also reported
     * through {@code isLimitsExceeded()}.
     *
     * @param contentSizeLimitExceeded the new flag value
     */
    public void setContentSizeLimitExceeded(boolean contentSizeLimitExceeded) {
        this.contentSizeLimitExceeded = contentSizeLimitExceeded;
    }

    /**
     * @return the current value of the {@code maxAnchorsExceeded} flag, which
     *         {@code addAnchor} raises when the anchor limit is reached
     */
    public boolean isMaxAnchorsExceeded() {
        return this.maxAnchorsExceeded;
    }

    /**
     * Tells whether any limit was hit: the content size flag, the anchor flag, or
     * the {@code isLimitExceeded()} state of any of the size-limited collections.
     *
     * @return true when at least one limit was exceeded
     */
    public boolean isLimitsExceeded() {
        if (this.contentSizeLimitExceeded || this.maxAnchorsExceeded) {
            return true;
        }
        if (this.css.isLimitExceeded() || this.rss.isLimitExceeded() || this.embeds.isLimitExceeded()) {
            return true;
        }
        if (this.metas.isLimitExceeded() || this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded()) {
            return true;
        }
        if (this.script.isLimitExceeded() || this.frames.isLimitExceeded() || this.iframes.isLimitExceeded()) {
            return true;
        }
        return this.linkedDataTypes.isLimitExceeded();
    }

    /**
     * Checks the {@code robots} meta value for the substring {@code noindex}
     * (case-sensitive).
     *
     * @return true when the robots meta value contains {@code noindex}
     */
    public boolean indexingDenied() {
        String robots = (String)this.metas.get("robots");
        return robots != null && robots.contains("noindex");
    }

    /**
     * Checks the {@code robots} meta value for the substring {@code nofollow}
     * (case-sensitive).
     *
     * @return true when the robots meta value contains {@code nofollow}
     */
    public boolean followDenied() {
        String robots = (String)this.metas.get("robots");
        return robots != null && robots.contains("nofollow");
    }

    /**
     * Returns the document description taken from the {@code description} meta
     * value, or from {@code dc.description} when the former is absent.
     *
     * @return a new list with at most one entry; empty when neither meta value exists
     */
    public List<String> getDescriptions() {
        String description = (String)this.metas.get("description");
        if (description == null) {
            description = (String)this.metas.get("dc.description");
        }
        List<String> result = new ArrayList<String>();
        if (description != null) {
            result.add(description);
        }
        return result;
    }

    /**
     * @return the {@code content-type} meta value, or {@code EMPTY_STRING} when absent
     */
    public String getContentType() {
        String contentType = (String)this.metas.get("content-type");
        return contentType == null ? EMPTY_STRING : contentType;
    }

    /**
     * @return the {@code author} meta value, else the {@code dc.creator} meta
     *         value, else {@code EMPTY_STRING}
     */
    public String getAuthor() {
        String author = (String)this.metas.get("author");
        if (author != null) {
            return author;
        }
        author = (String)this.metas.get("dc.creator");
        return author == null ? EMPTY_STRING : author;
    }

    /**
     * @return the {@code copyright} meta value, else the {@code dc.publisher} meta
     *         value, else {@code EMPTY_STRING}
     */
    public String getPublisher() {
        String publisherName = (String)this.metas.get("copyright");
        if (publisherName != null) {
            return publisherName;
        }
        publisherName = (String)this.metas.get("dc.publisher");
        return publisherName == null ? EMPTY_STRING : publisherName;
    }

    /**
     * Extracts the language codes declared in the {@code content-language} meta
     * value (or {@code dc.language} when the former is absent).
     * <p>
     * The value is split with {@code commaSepPattern}; each token is lower-cased,
     * cut at the first {@code '-'} (so {@code en-US} becomes {@code en}) and kept
     * only when {@code ISO639.exists} accepts it.
     *
     * @return the set of accepted language codes, or null when no meta value is
     *         present or no token is accepted
     */
    public Set<String> getContentLanguages() {
        String s = (String)this.metas.get("content-language");
        if (s == null) {
            s = (String)this.metas.get("dc.language");
        }
        if (s == null) {
            return null;
        }
        Set<String> languages = new HashSet<String>();
        for (String token : commaSepPattern.split(s)) {
            // Locale.ROOT: language codes are ASCII identifiers; the default locale
            // would turn "IT" into "ıt" on a Turkish system and fail the lookup.
            String code = token.toLowerCase(Locale.ROOT);
            int dash = code.indexOf('-');
            if (dash > 0) {
                code = code.substring(0, dash);
            }
            if (ISO639.exists(code)) {
                languages.add(code);
            }
        }
        return languages.isEmpty() ? null : languages;
    }

    /**
     * Splits the {@code keywords} meta value (or {@code dc.description} when the
     * former is absent) into single keywords.
     * <p>
     * The separator is chosen by content: {@code commaSepPattern} when the value
     * contains a comma, else {@code semicSepPattern} when it contains a semicolon,
     * else single whitespace characters.
     *
     * @return the keywords; an empty array when there is no value or it is empty
     */
    public String[] getKeywords() {
        String keywords = (String)this.metas.get("keywords");
        if (keywords == null) {
            keywords = (String)this.metas.get("dc.description");
        }
        if (keywords == null) {
            keywords = EMPTY_STRING;
        }
        if (keywords.isEmpty()) {
            return new String[0];
        }
        if (keywords.indexOf(',') >= 0) {
            return commaSepPattern.split(keywords);
        }
        if (keywords.indexOf(';') >= 0) {
            return semicSepPattern.split(keywords);
        }
        return keywords.split("\\s");
    }

    /**
     * Reads the delay from the {@code refresh} meta value, i.e. the decimal number
     * in front of the first {@code ';'}.
     *
     * @return the parsed delay, or 9999 when the meta value is absent, has no
     *         {@code ';'}, or the number cannot be parsed
     */
    public int getRefreshSeconds() {
        final int noRefresh = 9999;
        String refresh = (String)this.metas.get("refresh");
        if (refresh == null) {
            return noRefresh;
        }
        int separator = refresh.indexOf(';');
        if (separator < 0) {
            return noRefresh;
        }
        try {
            return NumberTools.parseIntDecSubstring(refresh, 0, separator);
        }
        catch (NumberFormatException e) {
            return noRefresh;
        }
    }

    /**
     * Reads the target from the {@code refresh} meta value: the text after the
     * first {@code ';'}, provided it starts with {@code url=} (any letter case).
     *
     * @return the trimmed target after {@code url=}, or {@code EMPTY_STRING} when
     *         the meta value is absent or does not have that form
     */
    public String getRefreshPath() {
        String refresh = (String)this.metas.get("refresh");
        if (refresh == null) {
            return EMPTY_STRING;
        }
        int separator = refresh.indexOf(';');
        if (separator < 0) {
            return EMPTY_STRING;
        }
        String target = refresh.substring(separator + 1).trim();
        if (!target.toLowerCase().startsWith("url=")) {
            return EMPTY_STRING;
        }
        return target.substring(4).trim();
    }

    /**
     * Determines the document date from the meta values. The keys {@code date},
     * {@code dc.date.modified}, {@code dc.date.created}, {@code dc.date},
     * {@code dc:date} and {@code last-modified} are tried in that order; the first
     * value that parses with the ISO 8601 formatter wins.
     *
     * @return the first successfully parsed date, or the current time when no
     *         candidate exists or none can be parsed
     */
    public Date getDate() {
        final String[] dateKeys = {"date", "dc.date.modified", "dc.date.created", "dc.date", "dc:date", "last-modified"};
        for (String key : dateKeys) {
            String value = (String)this.metas.get(key);
            if (value == null) {
                continue;
            }
            try {
                return ISO8601Formatter.FORMATTER.parse(value, this.timezoneOffset).getTime();
            }
            catch (ParseException ignored) {
                // unparsable value: fall through to the next candidate key
            }
        }
        return new Date();
    }

    /**
     * Returns the longitude of the document, resolving it lazily from the meta
     * values {@code ICBM} and then {@code geo.position}.
     * <p>
     * Each value is expected as {@code <lat><sep><lon>} where the separator is the
     * first {@code ';'}, else the first {@code ','}, else the first space. Both
     * {@code lat} and {@code lon} are cached in the instance once parsed. A value
     * whose numbers cannot be parsed is skipped, so malformed page content neither
     * throws nor leaves a latitude without its longitude.
     *
     * @return the longitude, or the unchanged cached value (initially presumably
     *         0.0) when no meta value yields one
     */
    public double getLon() {
        if (this.lon != 0.0) {
            return this.lon;
        }
        for (String key : new String[] {"ICBM", "geo.position"}) {
            String s = (String)this.metas.get(key);
            if (s != null) {
                int p = s.indexOf(';');
                if (p < 0) {
                    p = s.indexOf(',');
                }
                if (p < 0) {
                    p = s.indexOf(' ');
                }
                if (p > 0) {
                    try {
                        // parse both numbers before assigning, so a failure on the
                        // second one cannot leave a half-updated coordinate pair
                        double parsedLat = Double.parseDouble(s.substring(0, p).trim());
                        double parsedLon = Double.parseDouble(s.substring(p + 1).trim());
                        this.lat = parsedLat;
                        this.lon = parsedLon;
                    }
                    catch (NumberFormatException ignored) {
                        // meta content comes from the page and may be garbage: treat as absent
                    }
                }
            }
            if (this.lon != 0.0) {
                return this.lon;
            }
        }
        return this.lon;
    }

    /**
     * Returns the latitude of the document. When none is cached yet,
     * {@code getLon()} is invoked first because it resolves both coordinates.
     *
     * @return the cached latitude after the optional resolution attempt
     */
    public double getLat() {
        if (this.lat == 0.0) {
            this.getLon();
        }
        return this.lat;
    }

    /**
     * @return the model names reported by {@code evaluationScores.getModelNames()}
     */
    public Set<String> getEvaluationModelNames() {
        return this.evaluationScores.getModelNames();
    }

    /**
     * Returns the keys of the score map of one evaluation model, in the order
     * delivered by {@code keys(false)}.
     *
     * @param modelName the name of the evaluation model
     * @return a new array of score names; empty when the model has no score map
     */
    public String[] getEvaluationModelScoreNames(String modelName) {
        ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
        List<String> names = new ArrayList<String>();
        if (scores != null) {
            for (Iterator<String> it = scores.keys(false); it.hasNext();) {
                names.add(it.next());
            }
        }
        return names.toArray(new String[names.size()]);
    }

    /**
     * Looks up the score of each given name in the score map of one evaluation model.
     *
     * @param modelName the name of the evaluation model
     * @param a         the score names to look up; must not be null
     * @return an array parallel to {@code a}; all entries stay null when the model
     *         has no score map
     */
    public Integer[] getEvaluationModelScoreCounts(String modelName, String[] a) {
        ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
        final Integer[] counts = new Integer[a.length];
        if (scores == null) {
            return counts;
        }
        int slot = 0;
        for (String name : a) {
            counts[slot++] = scores.get(name);
        }
        return counts;
    }

    /**
     * Releases everything this scraper has collected: after the superclass
     * {@code close()}, every result collection is cleared, and {@code headlines}
     * and {@code root} are set to null.
     * <p>
     * Because {@code headlines} becomes null, methods that index into it (for
     * example {@code getHeadlines} or {@code print}) must not be called afterwards.
     */
    @Override
    public void close() {
        super.close();
        // link-like results
        this.anchors.clear();
        this.rss.clear();
        this.css.clear();
        this.script.clear();
        this.frames.clear();
        this.iframes.clear();
        this.linkedDataTypes.clear();
        this.embeds.clear();
        this.images.clear();
        this.icons.clear();
        // meta information
        this.metas.clear();
        this.hreflang.clear();
        this.navigation.clear();
        // textual results
        this.titles.clear();
        this.articles.clear();
        this.startDates.clear();
        this.endDates.clear();
        // the headline array is dropped entirely rather than cleared per level
        this.headlines = null;
        this.bold.clear();
        this.italic.clear();
        this.underline.clear();
        this.li.clear();
        this.dt.clear();
        this.dd.clear();
        this.content.clear();
        this.root = null;
    }

    /**
     * Dumps the scraping result to standard output: every title, the first four
     * headline levels (labelled with their 0-based array index), then anchors,
     * images, metas and the text content.
     */
    public void print() {
        for (String title : this.titles) {
            System.out.println("TITLE    :" + title);
        }
        int level = 0;
        while (level < 4) {
            System.out.println("HEADLINE" + level + ":" + this.headlines[level].toString());
            level++;
        }
        System.out.println("ANCHORS  :" + this.anchors.toString());
        System.out.println("IMAGES   :" + this.images.toString());
        System.out.println("METAS    :" + this.metas.toString());
        System.out.println("TEXT     :" + this.content.toString());
    }

    /**
     * Registers a listener. A {@code ContentScraperListener} is stored under that
     * type, any other listener under {@code ScraperListener}.
     *
     * @param listener the listener to add; null is ignored
     */
    @Override
    public void registerHtmlFilterEventListener(ScraperListener listener) {
        if (listener == null) {
            return;
        }
        if (listener instanceof ContentScraperListener) {
            this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
            return;
        }
        this.htmlFilterEventListeners.add(ScraperListener.class, listener);
    }

    /**
     * Removes a listener from the type slot it was registered under
     * ({@code ContentScraperListener} or plain {@code ScraperListener}).
     *
     * @param listener the listener to remove; null is ignored
     */
    @Override
    public void deregisterHtmlFilterEventListener(ScraperListener listener) {
        if (listener == null) {
            return;
        }
        if (listener instanceof ContentScraperListener) {
            this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
            return;
        }
        this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
    }

    /**
     * Notifies all listeners registered as {@code ScraperListener} or
     * {@code ContentScraperListener} by calling their {@code scrapeTag0}.
     *
     * @param tagname the tag name to report
     * @param tagopts the tag attributes to report
     */
    private void fireScrapeTag0(String tagname, Properties tagopts) {
        // the listener list is a flat array of (type, listener) pairs
        final Object[] registered = this.htmlFilterEventListeners.getListenerList();
        for (int idx = 0; idx < registered.length; idx += 2) {
            Object type = registered[idx];
            if (type == ScraperListener.class || type == ContentScraperListener.class) {
                ((ScraperListener)registered[idx + 1]).scrapeTag0(tagname, tagopts);
            }
        }
    }

    /**
     * Notifies all listeners registered as {@code ScraperListener} or
     * {@code ContentScraperListener} by calling their {@code scrapeTag1}.
     *
     * @param tagname the tag name to report
     * @param tagopts the tag attributes to report
     * @param text    the tag content to report
     */
    private void fireScrapeTag1(String tagname, Properties tagopts, char[] text) {
        // the listener list is a flat array of (type, listener) pairs
        final Object[] registered = this.htmlFilterEventListeners.getListenerList();
        for (int idx = 0; idx < registered.length; idx += 2) {
            Object type = registered[idx];
            if (type == ScraperListener.class || type == ContentScraperListener.class) {
                ((ScraperListener)registered[idx + 1]).scrapeTag1(tagname, tagopts, text);
            }
        }
    }

    /**
     * Notifies all listeners registered as {@code ContentScraperListener} that an
     * anchor was added.
     *
     * @param anchorURL the anchor URL string to report
     */
    private void fireAddAnchor(String anchorURL) {
        // the listener list is a flat array of (type, listener) pairs
        final Object[] registered = this.htmlFilterEventListeners.getListenerList();
        for (int idx = 0; idx < registered.length; idx += 2) {
            if (registered[idx] == ContentScraperListener.class) {
                ((ContentScraperListener)registered[idx + 1]).anchorAdded(anchorURL);
            }
        }
    }

    /**
     * Parses an HTML file from disk into a new {@code ContentScraper}.
     * <p>
     * The file is read completely, its charset is detected with a temporary
     * {@code ScraperInputStream} (falling back to the platform default charset when
     * detection yields nothing), and the bytes are then piped through a
     * {@code TransformerWriter} into the scraper. {@code http://localhost} is used
     * as the document URL.
     * <p>
     * Both the detection stream and the writer are closed even when detection or
     * copying fails.
     *
     * @param file           the HTML file to parse
     * @param maxLinks       the link limit handed to the scraper
     * @param timezoneOffset the timezone offset handed to the scraper
     * @return the scraper holding the parsing result
     * @throws IOException when the file has no content or reading/parsing fails
     */
    public static ContentScraper parseResource(File file, int maxLinks, int timezoneOffset) throws IOException {
        byte[] page = FileUtils.read(file);
        if (page == null) {
            throw new IOException("no content in file " + file.toString());
        }
        String charset;
        try (ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page), StandardCharsets.UTF_8.name(), new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), new DigestURL("http://localhost"), false, maxLinks, timezoneOffset)) {
            charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
        }
        if (charset == null) {
            charset = Charset.defaultCharset().toString();
        }
        ContentScraper scraper = new ContentScraper(new DigestURL("http://localhost"), maxLinks, new HashSet<String>(), TagValency.EVAL, new VocabularyScraper(), timezoneOffset);
        try (TransformerWriter writer = new TransformerWriter(null, null, scraper, false)) {
            FileUtils.copy((InputStream)new ByteArrayInputStream(page), writer, Charset.forName(charset));
        }
        return scraper;
    }

    // Class initialization: sorts the known tag names into the two link-tag
    // collections by tag type and compiles the shared regular expressions.
    static {
        for (TagName tag : TagName.values()) {
            // singleton tags go to linkTags0, pair tags to linkTags1
            if (tag.type == TagType.singleton) {
                linkTags0.add(tag.name());
            }
            if (tag.type != TagType.pair) continue;
            linkTags1.add(tag.name());
        }
        // matches the URL scheme prefixes smb, ftp, http and https
        protp = Pattern.compile("smb://|ftp://|http://|https://");
        // matches one whitespace character
        WHITESPACE_PATTERN = Pattern.compile("\\s");
        // separators for list-like meta values: a space or a comma / a space or a semicolon
        commaSepPattern = Pattern.compile(" |,");
        semicSepPattern = Pattern.compile(" |;");
    }

    /**
     * A scraped HTML tag: its name, its attributes, its content buffer and the
     * valency that decides how the scraper treats it.
     */
    public static class Tag {
        /** Size argument for content buffers created by this class (0x100000 = 1 Mi). */
        private static final int CONTENT_BUFFER_SIZE = 0x100000;

        public String name;
        public Properties opts;
        public CharBuffer content;
        private TagValency tv;

        /**
         * Creates a tag with empty attributes and a fresh content buffer.
         *
         * @param name           the tag name
         * @param defaultValency the initial valency
         */
        public Tag(String name, TagValency defaultValency) {
            this(name, defaultValency, new Properties());
        }

        /**
         * Creates a tag with the given attributes and a fresh content buffer.
         *
         * @param name           the tag name
         * @param defaultValency the initial valency
         * @param opts           the tag attributes
         */
        public Tag(String name, TagValency defaultValency, Properties opts) {
            this(name, defaultValency, opts, new CharBuffer(CONTENT_BUFFER_SIZE));
        }

        /**
         * Creates a tag from all of its parts.
         *
         * @param name           the tag name
         * @param defaultValency the initial valency
         * @param opts           the tag attributes
         * @param content        the content buffer to use
         */
        public Tag(String name, TagValency defaultValency, Properties opts, CharBuffer content) {
            this.name = name;
            this.tv = defaultValency;
            this.opts = opts;
            this.content = content;
        }

        /**
         * Closes the content buffer (when present) and drops the references to
         * name, attributes and content.
         */
        public void close() {
            this.name = null;
            this.opts = null;
            if (this.content != null) {
                this.content.close();
                this.content = null;
            }
        }

        /**
         * @return the tag rendered as {@code <name opts>content</name>}
         */
        @Override
        public String toString() {
            String opening = "<" + this.name + " " + this.opts + ">";
            String closing = "</" + this.name + ">";
            return opening + this.content + closing;
        }

        /**
         * @return true when the valency is {@code TagValency.IGNORE}
         */
        public boolean isIgnore() {
            return TagValency.IGNORE == this.tv;
        }

        /**
         * @return the current valency
         */
        public TagValency getValency() {
            return this.tv;
        }

        /**
         * @param tv the new valency
         */
        public void setValency(TagValency tv) {
            this.tv = tv;
        }
    }

    /**
     * The tag names known to the scraper, each bound to a {@link TagType}.
     * The enclosing class' static initializer registers every singleton tag name
     * in {@code linkTags0} and every pair tag name in {@code linkTags1}.
     */
    public static enum TagName {
        // tags declared as singleton
        html(TagType.singleton),
        body(TagType.singleton),
        img(TagType.singleton),
        base(TagType.singleton),
        frame(TagType.singleton),
        meta(TagType.singleton),
        area(TagType.singleton),
        link(TagType.singleton),
        embed(TagType.singleton),
        param(TagType.singleton),
        iframe(TagType.singleton),
        source(TagType.singleton),
        // tags declared as pair
        a(TagType.pair),
        h1(TagType.pair),
        h2(TagType.pair),
        h3(TagType.pair),
        h4(TagType.pair),
        h5(TagType.pair),
        h6(TagType.pair),
        title(TagType.pair),
        b(TagType.pair),
        em(TagType.pair),
        strong(TagType.pair),
        u(TagType.pair),
        i(TagType.pair),
        li(TagType.pair),
        dt(TagType.pair),
        dd(TagType.pair),
        script(TagType.pair),
        span(TagType.pair),
        div(TagType.pair),
        nav(TagType.pair),
        article(TagType.pair),
        time(TagType.pair),
        style(TagType.pair);

        /** The type this tag name was declared with. */
        public TagType type;

        private TagName(TagType type) {
            this.type = type;
        }
    }

    /**
     * The two kinds of tags the scraper distinguishes.
     * NOTE(review): presumably {@code singleton} tags are handled without content
     * and {@code pair} tags with the content between opening and closing tag -
     * confirm against the scrapeTag0/scrapeTag1 implementations.
     */
    public static enum TagType {
        singleton,
        pair;

    }
}

