/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.parser.html;

import java.text.ParseException;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.Month;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.date.CustomISO8601Formatter;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.storage.SizeLimitedMap;
import net.yacy.cora.util.ConcurrentLog;

/**
 * Helper that derives a publication date for a scraped HTML document.
 *
 * <p>The candidates are tried in a fixed order and the first one that yields a
 * date wins:
 * <ol>
 *   <li>well-known date metadata entries (see {@link #META_DATE_SOURCES}),</li>
 *   <li>a {@code /YYYY/MM[/DD]/}-like date embedded in the document URL,</li>
 *   <li>the supplied last-modified date,</li>
 *   <li>the most frequent non-future date among the dates found in the page content,</li>
 *   <li>the current date.</li>
 * </ol>
 */
public class ContentScraperDateUtil {
    private static final ConcurrentLog log = new ConcurrentLog("SCRAPER_DATE");

    /**
     * Matches a date embedded in a URL path: a year (19xx or 20xx), a month given
     * either as two digits or as an English month name, and an optional two-digit
     * day. The parts may be separated by '-' or '/' or by nothing at all, and the
     * whole date must be enclosed in '/'. Group 1 = year, group 2 = month,
     * group 4 = day (null when absent).
     */
    private static final Pattern URL_DATE_REGEX = Pattern.compile("/(19\\d{2}|20\\d{2})[-/]?(0[1-9]|1[0-2]|January|February|March|April|May|June|July|August|September|October|November|December)((?:[-/]?(0[1-9]|[12]\\d|3[01]))?)/", Pattern.CASE_INSENSITIVE);

    /**
     * Metadata entries that may carry a publication date, in priority order.
     * Each row is { human-readable description of the originating markup (only
     * used for logging), key looked up in the metas map }.
     */
    private static final String[][] META_DATE_SOURCES = {
        {"<script id=\"schema\" type=\"application/ld+json\">{...,\"datePublished\":\"2023-07-10T14:40:52+02:00\",..}</script>", "script.datepublished"},
        {"<meta name=\"article:published_time\" content=\"YYYY-MM-DD...\" />", "article:published_time"},
        {"<meta name=\"DC.date.issued\" content=\"YYYY-MM-DD...\" />", "dc.date.issued"},
        {"<meta name=\"DC.date.modified\" content=\"YYYY-MM-DD...\" />", "dc.date.modified"},
        {"<meta name=\"DC.date.created\" content=\"YYYY-MM-DD...\" />", "dc.date.created"},
        {"<meta name=\"DC.date\" content=\"YYYY-MM-DD...\" />", "dc.date"},
    };

    /**
     * Determines the publication date of a document.
     *
     * @param root           URL of the document; may be {@code null}, in which case the URL step is skipped
     * @param metas          metadata map of the document, queried with the keys listed in
     *                       {@link #META_DATE_SOURCES}; may be {@code null}, in which case the metadata step is skipped
     * @param timezoneOffset offset handed unchanged to {@code CustomISO8601Formatter.parse}
     *                       (NOTE(review): unit is defined by that formatter - confirm there)
     * @param startDates     dates found in the page content; may be {@code null} or empty
     * @param lastModified   last-modified date of the document; may be {@code null}
     * @return the first date found following the order described on the class; never {@code null}
     *         (the current date is the last resort)
     */
    public static Date getDate(DigestURL root, SizeLimitedMap<String, String> metas, int timezoneOffset, List<Date> startDates, Date lastModified) {
        final Date currentDate = new Date();

        final Date metaDate = parseDateFromMetas(metas, timezoneOffset);
        if (metaDate != null) {
            return metaDate;
        }

        final Date urlDate = parseDateFromUrl(root, timezoneOffset);
        if (urlDate != null) {
            return urlDate;
        }

        if (lastModified != null) {
            return lastModified;
        }

        final Date contentDate = findMostFrequentDate(startDates, currentDate);
        if (contentDate != null) {
            log.info("Publish date found in startDates in the page content with value: '" + String.valueOf(contentDate) + "'");
            return contentDate;
        }

        log.info("Publish date not found, current date used: '" + String.valueOf(currentDate) + "'");
        return currentDate;
    }

    /**
     * Walks {@link #META_DATE_SOURCES} in order and returns the first metadata
     * value that can be parsed as a date.
     *
     * @return the parsed date, or {@code null} if {@code metas} is {@code null} or no entry is parsable
     */
    private static Date parseDateFromMetas(SizeLimitedMap<String, String> metas, int timezoneOffset) {
        if (metas == null) {
            return null;
        }
        for (final String[] source : META_DATE_SOURCES) {
            final Date parsed = parseDate(source[0], metas.get(source[1]), timezoneOffset);
            if (parsed != null) {
                return parsed;
            }
        }
        return null;
    }

    /**
     * Parses a single metadata value with {@code CustomISO8601Formatter}.
     *
     * @param tag   description of the markup the value came from (logging only)
     * @param value raw metadata value; may be {@code null}
     * @return the parsed date, or {@code null} if the value is absent or unparsable
     */
    private static Date parseDate(String tag, String value, int timezoneOffset) {
        if (value == null) {
            return null;
        }
        try {
            final Date parsed = CustomISO8601Formatter.CUSTOM_FORMATTER.parse(value, timezoneOffset).getTime();
            // Log only after a successful parse so the message is never emitted for a rejected value.
            log.info("Publish date found according to: '" + tag + "' pattern with value: '" + value + "'");
            return parsed;
        } catch (ParseException e) {
            // Best effort: an unparsable value is not fatal, the next candidate is tried by the caller.
            log.info("Publish date value '" + value + "' of: '" + tag + "' pattern could not be parsed: " + e.getMessage());
            return null;
        }
    }

    /**
     * Extracts a date from the first {@link #URL_DATE_REGEX} match in the URL.
     * A missing day is taken as the first day of the month.
     *
     * @return the date, or {@code null} if the URL is {@code null}, contains no date,
     *         or the matched parts do not form a valid calendar date
     */
    private static Date parseDateFromUrl(DigestURL root, int timezoneOffset) {
        if (root == null) {
            return null;
        }
        final String url = root.toString();
        if (url == null) {
            return null;
        }
        final Matcher dateMatcher = URL_DATE_REGEX.matcher(url);
        if (!dateMatcher.find()) {
            return null;
        }

        final int year = Integer.parseInt(dateMatcher.group(1));

        // The regex guarantees the month is either two digits or a full English month name.
        final String monthPart = dateMatcher.group(2);
        final int monthValue;
        if (Character.isDigit(monthPart.charAt(0))) {
            monthValue = Integer.parseInt(monthPart);
        } else {
            monthValue = Month.valueOf(monthPart.toUpperCase(Locale.US)).getValue();
        }

        final String dayPart = dateMatcher.group(4);
        final int dayValue;
        if (dayPart != null && !dayPart.isEmpty()) {
            dayValue = Integer.parseInt(dayPart);
        } else {
            dayValue = 1;
            log.info("Day part missing, deduced as the first day of the month in URL: '" + url + "'");
        }

        try {
            // LocalDate.of validates the combination (e.g. rejects February 31st).
            final LocalDate parsedDate = LocalDate.of(year, monthValue, dayValue);
            return CustomISO8601Formatter.CUSTOM_FORMATTER.parse(parsedDate.format(DateTimeFormatter.ISO_DATE), timezoneOffset).getTime();
        } catch (ParseException | DateTimeException e) {
            log.warn("Error: " + e.getMessage() + " (probably invalid day for month)");
            return null;
        }
    }

    /**
     * Returns the date occurring most often in {@code dates}, ignoring
     * {@code null} entries and dates after {@code currentDate} (a publication
     * date cannot lie in the future). Ties are resolved in favour of the latest date.
     *
     * @param dates       candidate dates; may be {@code null} or empty
     * @param currentDate upper bound (inclusive) for acceptable dates
     * @return the most frequent acceptable date, or {@code null} if there is none
     */
    private static Date findMostFrequentDate(List<Date> dates, Date currentDate) {
        if (dates == null || dates.isEmpty()) {
            return null;
        }

        // Filter before counting so that a frequent future date cannot mask valid candidates.
        final Map<Date, Integer> dateCounts = new HashMap<>();
        for (final Date candidate : dates) {
            if (candidate == null || candidate.after(currentDate)) {
                continue;
            }
            dateCounts.merge(candidate, 1, Integer::sum);
        }

        Date best = null;
        int bestCount = 0;
        for (final Map.Entry<Date, Integer> entry : dateCounts.entrySet()) {
            final int count = entry.getValue();
            if (count > bestCount || (count == bestCount && entry.getKey().after(best))) {
                best = entry.getKey();
                bestCount = count;
            }
        }
        return best;
    }
}

