#!/usr/bin/env python

'''
Validate layout of generated HTML pages.
(We validate the HTML because source may not be in Markdown.)
'''

import sys
import os
import glob
import fnmatch
import re
import yaml
from optparse import OptionParser
from bs4 import BeautifulSoup
from lxml import etree
import dateutil.parser


# Default lesson configuration: YAML text whose 'patterns' mapping pairs a
# filename pattern with the names of the rules to run on matching pages.
# read_config loads it when -l/--lesson is given without -c/--config, and
# main joins each pattern onto the source directory before matching.
LESSON_CONFIG = '''\
patterns:
  '*.html':
    - lesson_has_title_in_head
    - lesson_has_navbar
    - lesson_has_title_in_body
    - lesson_has_footer
  index.html:
    - lesson_has_prereq
    - lesson_has_syllabus
  '*-*/index.html':
    - lesson_has_objectives
'''

# Default workshop configuration: same YAML layout as the lesson
# configuration.  read_config loads it when -w/--workshop is given without
# -c/--config; every listed rule is applied to the workshop's index.html.
WORKSHOP_CONFIG = '''\
patterns:
  'index.html':
    - workshop_check_slug
    - workshop_check_country
    - workshop_check_language
    - workshop_check_humandate
    - workshop_check_humantime
    - workshop_check_startdate
    - workshop_check_enddate
    - workshop_check_latitude_longitude
    - workshop_check_instructors
    - workshop_check_helpers
    - workshop_check_contact
    - workshop_check_eventbrite
    - workshop_check_etherpad
'''


# Regular expression patterns for workshops.  They are applied through
# _check_regexp, which uses re.match, so each one is anchored at the start
# of the value only.

# Slug: at least one character.
SLUG_PATTERN = r'.+'
# Email: something, an '@', something, a '.', something (no further '@').
EMAIL_PATTERN = r'[^@]+@[^@]+\.[^@]+'
# Human time: either a 12-hour range with am/pm on both ends or a 24-hour
# range, separated by '-' or 'to'.  workshop_check_humantime strips spaces
# from the value before matching.
HUMANTIME_PATTERN = r'((0?[1-9]|1[0-2]):[0-5]\d(am|pm)(-|to)(0?[1-9]|1[0-2]):[0-5]\d(am|pm))|((0?\d|1\d|2[0-3]):[0-5]\d(-|to)(0?\d|1\d|2[0-3]):[0-5]\d)'
# Eventbrite key: nine or ten leading digits.
EVENTBRITE_PATTERN = r'\d{9,10}'
# URL: 'http://' or 'https://' followed by at least one character.
URL_PATTERN = r'https?://.+'

# Placeholder address that workshop_check_contact refuses to accept.
DEFAULT_CONTACT_EMAIL = 'admin@software-carpentry.org'

# Country and language codes.  Note that codes mean different things: 'ar'
# is 'Arabic' as a language but 'Argentina' as a country.

# Lowercase two-letter country codes accepted by workshop_check_country.
ISO_COUNTRY = [
    'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', 'as',
    'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh',
    'bi', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by', 'bz',
    'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co',
    'cr', 'cu', 'cv', 'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz',
    'ec', 'ee', 'eg', 'eh', 'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk', 'fm',
    'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm',
    'gn', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm', 'hn',
    'hr', 'ht', 'hu', 'id', 'ie', 'il', 'im', 'in', 'io', 'iq', 'ir', 'is',
    'it', 'je', 'jm', 'jo', 'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp',
    'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls', 'lt',
    'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mk', 'ml', 'mm',
    'mn', 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my',
    'mz', 'na', 'nc', 'ne', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu',
    'nz', 'om', 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr',
    'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa', 'sb',
    'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so',
    'sr', 'st', 'sv', 'sy', 'sz', 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk',
    'tl', 'tm', 'tn', 'to', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', 'um',
    'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu', 'wf', 'ws',
    'ye', 'yt', 'za', 'zm', 'zw'
]

# Lowercase two-letter language codes accepted by workshop_check_language.
ISO_LANGUAGE = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
    'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce',
    'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee',
    'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr',
    'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr',
    'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is',
    'it', 'iu', 'ja', 'jv', 'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn',
    'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lg', 'li', 'ln',
    'lo', 'lt', 'lu', 'lv', 'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms',
    'mt', 'my', 'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv',
    'ny', 'oc', 'oj', 'om', 'or', 'os', 'pa', 'pi', 'pl', 'ps', 'pt', 'qu',
    'rm', 'rn', 'ro', 'ru', 'rw', 'sa', 'sc', 'sd', 'se', 'sg', 'si', 'sk',
    'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'ta',
    'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw',
    'ty', 'ug', 'uk', 'ur', 'uz', 've', 'vi', 'vo', 'wa', 'wo', 'xh', 'yi',
    'yo', 'za', 'zh', 'zu'
]


# Record all the rules.
RULES = {}


def rule(fn):
    '''Decorator: register fn in RULES under its own name, then hand it
    back unchanged so the decorated name still refers to the function.'''

    name = fn.__name__
    RULES[name] = fn
    return fn


# Accumulate error messages.  _check and _check_one_element append to this
# list; main prints every entry once all rules have run.
MESSAGES = []

def main():
    '''Main driver: check all files with all rules that apply.

    Parses the command line, loads the pattern -> rules configuration,
    reads every HTML page under the source directory, and runs each rule
    whose pattern (joined onto the source directory) matches the page's
    path.  Accumulated messages are printed to standard output at the end.

    Exits with status 1 (via _require) if no pages are found or if the
    configuration names a rule that has not been registered.
    '''

    args = parse_args()
    read_config(args)
    docs = read_all_docs(args.source_dir)
    _require(docs, 'No source files found in {0}'.format(args.source_dir))

    for filename, doc in docs.items():
        if args.verbose > 0:
            print(filename, '...', file=sys.stderr)
        for pattern, rule_names in args.patterns.items():
            full_pattern = os.path.join(args.source_dir, pattern)
            if not fnmatch.fnmatch(filename, full_pattern):
                continue
            # A pattern with no rules listed loads from YAML as None.
            for rule_name in rule_names or []:
                # Report a mistyped rule name clearly instead of letting a
                # bare KeyError traceback escape.
                _require(rule_name in RULES,
                         'Unknown rule "{0}" for pattern "{1}"'.format(rule_name, pattern))
                if args.verbose > 1:
                    print('...', rule_name, file=sys.stderr)
                RULES[rule_name](filename, doc)

    for m in MESSAGES:
        print(m)


def parse_args():
    '''Parse and validate command-line arguments.

    Returns the optparse options object with attributes config_file,
    check_lesson, source_dir, verbose, and check_workshop.  Exits through
    _require if positional arguments are present or if -l and -w are not
    used exactly one at a time.
    '''

    # (flags, keyword settings) for every supported option, in help order.
    option_table = (
        (('-c', '--config'),
         {'default': None,
          'dest': 'config_file',
          'help': 'configuration file'}),
        (('-l', '--lesson'),
         {'default': False,
          'action': 'store_true',
          'dest': 'check_lesson',
          'help': 'check a lesson'}),
        (('-s', '--source'),
         {'default': '_site',
          'dest': 'source_dir',
          'help': 'source directory'}),
        (('-v', '--verbose'),
         {'default': 0,
          'action': 'count',
          'dest': 'verbose',
          'help': 'report actions'}),
        (('-w', '--workshop'),
         {'default': False,
          'action': 'store_true',
          'dest': 'check_workshop',
          'help': 'check a workshop'}),
    )

    parser = OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    options, leftovers = parser.parse_args()

    no_leftovers = not leftovers
    _require(no_leftovers, 'Unexpected trailing command-line arguments "{0}"'.format(leftovers))

    exactly_one_mode = options.check_lesson != options.check_workshop
    _require(exactly_one_mode, 'Must have exactly one of -l/-w')

    return options


def read_config(args):
    '''
    Read configuration file.

    Loads YAML from args.config_file when one is given, otherwise from the
    built-in lesson or workshop defaults, and stores the result on
    args.config.  The 'patterns' mapping is also stored on args.patterns.

    Exits through _require if the loaded configuration is not a mapping
    with a 'patterns' entry.  Raises AssertionError if neither a config
    file nor a lesson/workshop mode is selected.
    '''

    # safe_load is used because the configuration is plain mappings, lists
    # and strings; yaml.load without an explicit Loader is rejected by
    # recent PyYAML releases and can build arbitrary objects on older ones.
    if args.config_file:
        with open(args.config_file, 'r') as reader:
            args.config = yaml.safe_load(reader)
    elif args.check_lesson:
        args.config = yaml.safe_load(LESSON_CONFIG)
    elif args.check_workshop:
        args.config = yaml.safe_load(WORKSHOP_CONFIG)
    else:
        # Raised explicitly so the guard survives "python -O".
        raise AssertionError('Do not know what configuration to load')

    # An empty file loads as None; a file without 'patterns' is unusable.
    _require(isinstance(args.config, dict) and 'patterns' in args.config,
             'Configuration must be a mapping with a "patterns" entry')

    args.patterns = args.config['patterns']


def read_all_docs(source_dir):
    '''
    Load every '*.html' file found recursively under source_dir.

    Each page has the literal '<!doctype html>' line removed, is tidied
    with BeautifulSoup's html.parser, and is then parsed with lxml.
    Returns a dictionary mapping each file's path to its parsed document.
    Prints a message to stderr and exits with status 1 if an IOError
    occurs while handling a file.
    '''

    docs_by_path = {}
    html_glob = os.path.join(source_dir, '**/*.html')
    for html_path in glob.iglob(html_glob, recursive=True):
        try:
            with open(html_path, 'r') as handle:
                text = handle.read()
                without_doctype = text.replace('<!doctype html>\n', '')
                tidy = BeautifulSoup(without_doctype, 'html.parser').prettify()
                docs_by_path[html_path] = etree.fromstring(tidy)
        except IOError as err:
            print('Unable to open {0}: {1}'.format(html_path, err), file=sys.stderr)
            sys.exit(1)

    return docs_by_path


@rule
def lesson_has_footer(filename, doc):
    '''Page must contain exactly one <footer> element.'''

    _check_one_element(filename, doc, rulename='footers', xpath='//footer')


@rule
def lesson_has_navbar(filename, doc):
    '''Page must contain exactly one <div class="navbar-header">.'''

    navbar_xpath = '//div[@class="navbar-header"]'
    _check_one_element(filename, doc, 'div navbar', navbar_xpath)


@rule
def lesson_has_objectives(filename, doc):
    '''Episode has exactly one <blockquote class="objectives">.'''

    # The label is echoed in the error message, so it names the element
    # the XPath actually looks for (a blockquote, not a div).
    _check_one_element(filename, doc, 'objectives blockquote', '//blockquote[@class="objectives"]')


@rule
def lesson_has_prereq(filename, doc):
    '''Index page must contain exactly one <blockquote class="prereq">.'''

    prereq_xpath = '//blockquote[@class="prereq"]'
    _check_one_element(filename, doc, 'prerequisites blockquote', prereq_xpath)


@rule
def lesson_has_syllabus(filename, doc):
    '''Index page must contain one syllabus div holding one h2 title and
    one table as direct children.'''

    expected = (
        ('syllabus', '//div[@class="syllabus"]'),
        ('syllabus title', '//div[@class="syllabus"]/h2'),
        ('syllabus table', '//div[@class="syllabus"]/table'),
    )
    for label, xpath in expected:
        _check_one_element(filename, doc, label, xpath)


@rule
def lesson_has_title_in_head(filename, doc):
    '''Page must contain exactly one <title> somewhere under <head>.'''

    _check_one_element(filename, doc, rulename='title in head', xpath='//head//title')


@rule
def lesson_has_title_in_body(filename, doc):
    '''Page must contain exactly one <h1 class="maintitle"> under <body>.'''

    title_xpath = '//body//h1[@class="maintitle"]'
    _check_one_element(filename, doc, 'title in body', title_xpath)


@rule
def workshop_check_slug(filename, doc):
    '''"slug" metadata must be present and match SLUG_PATTERN.'''

    slug = _check_meta(filename, doc, 'slug')
    _check_regexp(SLUG_PATTERN, slug, 'invalid slug')


@rule
def workshop_check_country(filename, doc):
    '''"country" must be one of the lowercase two-letter codes listed in
    ISO_COUNTRY (ISO-3166).'''

    code = _check_meta(filename, doc, 'country')
    is_known = code in ISO_COUNTRY
    _check(is_known, 'Unknown country')


@rule
def workshop_check_language(filename, doc):
    '''"language" must be one of the lowercase two-letter codes listed in
    ISO_LANGUAGE (ISO-639).'''

    code = _check_meta(filename, doc, 'language')
    is_known = code in ISO_LANGUAGE
    _check(is_known, 'Unknown language')


@rule
def workshop_check_humandate(filename, doc):
    '''"humandate" must be a human-readable date with a 3-letter month and
    4-digit year.  Examples include "Feb 18-20, 2025" and "Feb 18 and
    20, 2025".  It may be in languages other than English, but the
    month name should be kept short to aid formatting of the main
    Software Carpentry web site.

    Every problem is recorded as a message rather than raised, so one
    malformed value cannot abort the remaining checks.  Note that the year
    is only checked for being all digits, not for its length.'''

    humandate = _check_meta(filename, doc, 'humandate')
    if not _check(humandate is not None, 'Invalid human date: value is None'):
        return
    if not _check(',' in humandate, 'Require comma in human date'):
        return

    # The year follows the last comma; splitting only there keeps values
    # containing several commas from failing to unpack.
    month_dates, year = humandate.rsplit(',', 1)

    # The first three characters must all belong to the month name.
    month = month_dates[:3]
    _check(' ' not in month, 'Cannot be spaces in month')

    # The fourth character must be a space ("February" is illegal).  Slicing
    # avoids an IndexError when the text before the comma is too short.
    _check(month_dates[3:4] == ' ', 'Month names must be three letters long')

    # Year must contain only digits.  Whitespace is stripped first because
    # "Feb 18-20, 2025" leaves " 2025" after the split, and the pattern is
    # anchored at the end because re.match only anchors at the start.
    _check_regexp(r'\d+$', year.strip(), 'Year must be only digits')


@rule
def workshop_check_humantime(filename, doc):
    '''"humantime" is a human-readable start and end time for the workshop,
    such as "09:00 - 16:00".  Spaces are removed before the value is
    matched against HUMANTIME_PATTERN; a missing or empty value is not
    pattern-checked here.'''

    humantime = _check_meta(filename, doc, 'humantime')
    if not humantime:
        return
    compact = humantime.replace(" ", "")
    _check_regexp(HUMANTIME_PATTERN, compact, 'Badly-formatted human time')


@rule
def workshop_check_startdate(filename, doc):
    '''"startdate" must be machine-readable start date for the workshop,
    and must be in YYYY-MM-DD format, e.g., "2015-07-01".

    The value is validated by handing it to dateutil's parser, so any date
    text that parser accepts passes.  Problems are recorded as messages.'''

    startdate = _check_meta(filename, doc, 'startdate')
    # dateutil raises TypeError when given None, so report a missing value
    # as a message instead of letting the whole run crash.
    if not _check(startdate is not None, 'Badly-formatted start date: value is None'):
        return
    try:
        dateutil.parser.parse(startdate)
    except (ValueError, OverflowError):
        # OverflowError is raised for numbers too large for the platform.
        _check(False, 'Badly-formatted start date')


@rule
def workshop_check_enddate(filename, doc):
    '''"enddate" must be machine-readable end date for the workshop,
    and must be in YYYY-MM-DD format, e.g., "2015-07-01".

    The value is validated by handing it to dateutil's parser, so any date
    text that parser accepts passes.  Problems are recorded as messages.'''

    enddate = _check_meta(filename, doc, 'enddate')
    # dateutil raises TypeError when given None, so report a missing value
    # as a message instead of letting the whole run crash.
    if not _check(enddate is not None, 'Badly-formatted end date: value is None'):
        return
    try:
        dateutil.parser.parse(enddate)
    except (ValueError, OverflowError):
        # OverflowError is raised for numbers too large for the platform.
        _check(False, 'Badly-formatted end date')


@rule
def workshop_check_latitude_longitude(filename, doc):
    '''"latlng" must be a valid latitude and longitude represented as two
    floating-point numbers separated by a comma.

    Latitude must lie in [-90, 90] and longitude in [-180, 180].  Problems
    are recorded as messages rather than raised.'''

    latlng = _check_meta(filename, doc, 'latlng')
    # A missing value would otherwise fail with AttributeError on .split.
    if not _check(latlng is not None, 'Unable to parse lat/long: value is None'):
        return

    try:
        # Unpacking raises ValueError unless there are exactly two parts,
        # and float() raises ValueError for non-numeric text.
        lat_text, lng_text = latlng.split(',')
        lat = float(lat_text)
        lng = float(lng_text)
    except ValueError:
        _check(False, 'Unable to parse lat/long')
        return

    _check((-90.0 <= lat <= 90.0) and (-180.0 <= lng <= 180.0),
           'Invalid numeric values for latitude/longitude')


@rule
def workshop_check_instructors(filename, doc):
    '''"instructor" must be a non-empty comma-separated list of quoted names,
    e.g. ['First name', 'Second name', ...].  Do not use "TBD" or other
    placeholders.

    Not implemented yet: this rule currently performs no validation.'''

    pass # FIXME: implement the check described in the docstring


@rule
def workshop_check_helpers(filename, doc):
    '''"helper" must be a comma-separated list of quoted names,
    e.g. ['First name', 'Second name', ...].  The list may be empty.  Do
    not use "TBD" or other placeholders.

    Not implemented yet: this rule currently performs no validation.'''

    pass # FIXME: implement the check described in the docstring


@rule
def workshop_check_contact(filename, doc):
    '''"contact" must look like an email address (text, an "@", more text
    containing a ".") and must differ from the default contact address
    "admin@software-carpentry.org".'''

    address = _check_meta(filename, doc, 'contact')
    _check_regexp(EMAIL_PATTERN, address, 'Invalid contact email')

    is_default = (address == DEFAULT_CONTACT_EMAIL)
    _check(not is_default, 'Cannot use default contact email')


@rule
def workshop_check_eventbrite(filename, doc):
    '''"eventbrite" registration key must start with at least nine digits
    (see EVENTBRITE_PATTERN).'''

    key = _check_meta(filename, doc, 'eventbrite')
    _check_regexp(EVENTBRITE_PATTERN, key, 'Eventbrite key must be 9 or more digits')


@rule
def workshop_check_etherpad(filename, doc):
    '''"etherpad" must be an http:// or https:// URL (see URL_PATTERN).'''

    address = _check_meta(filename, doc, 'etherpad')
    _check_regexp(URL_PATTERN, address, 'Etherpad address must be a valid URL')


def _check_meta(filename, doc, metaname):
    '''Return the "content" attribute of the single <meta name=metaname>
    element under html/head, or None if there is not exactly one such
    element (a message is recorded) or it has no "content" attribute.'''

    return _check_one_element(
        filename,
        doc,
        metaname,
        '//html/head/meta[@name="{0}"]'.format(metaname),
        attribute='content')


def _check_one_element(filename, doc, rulename, xpath, attribute=None):
    '''Check that xpath selects exactly one element in doc.

    Records a message naming filename and rulename when the number of
    matches is not one.  Returns the value of the requested attribute of
    the single match (None if the element lacks it); returns None when no
    attribute was requested or the match count was wrong.'''

    matches = doc.xpath(xpath)
    count = len(matches)

    if count != 1:
        MESSAGES.append('In {0}, checking {1}: expected 1 match, got {2}'.format(filename, rulename, count))
        return None

    if attribute is None:
        return None

    return matches[0].attrib.get(attribute, None)


def _check_regexp(pattern, value, message):
    '''Record message unless value is a string that pattern matches from
    its start (re.match).  A None value is reported with the suffix
    ': value is None' and is not matched against the pattern.'''

    if not _check(value is not None, message + ': value is None'):
        return
    _check(re.match(pattern, value), message)


def _check(condition, message):
    '''Append message to MESSAGES when condition is falsy.  The condition
    itself is always returned so calls can be chained or used in an if.'''

    if condition:
        return condition
    MESSAGES.append(message)
    return condition

def _require(condition, message):
    '''Abort the program unless condition is truthy: print message to
    stderr and exit with status 1.'''

    if condition:
        return
    print(message, file=sys.stderr)
    sys.exit(1)


# Run the checker only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
