check

#! /usr/bin/env python

"""
Validate Software Carpentry lessons
according to the Markdown template specification described here:
http://software-carpentry.org/blog/2014/10/new-lesson-template-v2.html

Validates the presence of headings, as well as specific sub-nodes.
Contains validators for several kinds of template.

Call at command line with flag -h to see options and usage instructions.
"""
from __future__ import print_function

import argparse
import glob
import hashlib
import logging
import os
import re
import sys

try:
    # Code tested with CommonMark version 0.5.4; API may change
    import CommonMark
except ImportError:
    ERROR_MESSAGE = """This program requires the CommonMark python package.
Install using

    # pip install commonmark

or

    # easy_install commonmark
"""
    print(ERROR_MESSAGE)
    sys.exit(1)

import validation_helpers as vh


class MarkdownValidator(object):
    """Base class for Markdown validation

    Contains basic validation skeleton to be extended for specific page types
    """
    HEADINGS = []  # List of strings containing expected heading text
    WARN_ON_EXTRA_HEADINGS = True  # Warn when other headings are present?

    DOC_HEADERS = {}  # Rows in header section (first few lines of document).

    def __init__(self, filename=None, markdown=None):
        """Perform validation on a Markdown document.

        Validator accepts either the path to a file containing Markdown,
        OR a valid Markdown string. The latter is useful for unit testing."""
        self.filename = filename

        if filename:
            # Expect Markdown files to be in same directory as the input file
            self.markdown_dir = os.path.dirname(filename)
            self.lesson_dir = self.markdown_dir
            with open(filename, 'rU') as f:
                self.markdown = f.read()
        else:
            # Look for linked content in ../pages (relative to this file)
            self.lesson_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), os.pardir))

            self.markdown_dir = self.lesson_dir
            self.markdown = markdown

        ast = self._parse_markdown(self.markdown)
        self.ast = vh.CommonMarkHelper(ast)

    def _parse_markdown(self, markdown):
        parser = CommonMark.DocParser()
        ast = parser.parse(markdown)
        return ast

    def _validate_hrs(self):
        """Validate header

        Verify that the header section at top of document
        is bracketed by two horizontal rules"""
        valid = True
        try:
            hr_nodes = [self.ast.children[0], self.ast.children[2]]
        except IndexError:
            logging.error(
                "In {0}: "
                "Document must include header sections".format(self.filename))
            return False

        for hr in hr_nodes:
            if not self.ast.is_hr(hr):
                logging.error(
                    "In {0}: "
                    "Expected --- at line: {1}".format(
                        self.filename, hr.start_line))
                valid = False
        return valid

    def _validate_one_doc_header_row(self, text):
        """Validate a single row of the document header section"""
        label, content = text.split(":", 1)
        if label not in self.DOC_HEADERS:
            logging.warning(
                "In {0}: "
                "Unrecognized label in header section: {1}".format(
                    self.filename, label))
            return False

        validation_function = self.DOC_HEADERS[label]
        validate_header = validation_function(content)
        if not validate_header:
            logging.error(
                "In {0}: "
                "Document header field for label {1} "
                "does not follow expected format".format(self.filename, label))
        return validate_header

    # Methods related to specific validation. Can override specific tests.
    def _validate_doc_headers(self):
        """Validate the document header section.

        Pass only if the header of the document contains the specified
            sections with the expected contents"""

        # Header section should be wrapped in hrs
        has_hrs = self._validate_hrs()

        # Labeled sections in the actual headers should match expected format
        header_node = self.ast.children[1]
        test_headers = [self._validate_one_doc_header_row(s)
                        for s in header_node.strings]

        # Must have all expected header lines, and no others.
        only_headers = (len(header_node.strings) == len(self.DOC_HEADERS))

        # Headings must appear in the order expected
        valid_order = self._validate_section_heading_order()

        return has_hrs and all(test_headers) and only_headers and valid_order

    def _validate_section_heading_order(self, ast_node=None, headings=None):
        """Verify that section headings appear, and in the order expected"""
        # TODO: Refactor into individual tests in the future
        if ast_node is None:
            ast_node = self.ast.data
            headings = self.HEADINGS

        heading_nodes = self.ast.get_section_headings(ast_node)
        # All headings should be exactly level 2
        correct_level = True
        for n in heading_nodes:
            if n.level != 2:
                logging.error(
                    "In {0}: "
                    "Heading at line {1} should be level 2".format(
                        self.filename, n.start_line))
                correct_level = False

        heading_labels = [vh.strip_attrs(n.strings[0]) for n in heading_nodes]

        # Check for missing and extra headings
        missing_headings = [expected_heading for expected_heading in headings
                            if expected_heading not in heading_labels]

        extra_headings = [found_heading for found_heading in heading_labels
                          if found_heading not in headings]

        for h in missing_headings:
            logging.error(
                "In {0}: "
                "Document is missing expected heading: {1}".format(
                    self.filename, h))

        if self.WARN_ON_EXTRA_HEADINGS is True:
            for h in extra_headings:
                logging.error(
                    "In {0}: "
                    "Document contains heading "
                    "not specified in the template: {1}".format(
                        self.filename, h))
            no_extra = (len(extra_headings) == 0)
        else:
            no_extra = True

        # Check that the subset of headings
        # in the template spec matches order in the document
        valid_order = True
        headings_overlap = [h for h in heading_labels if h in headings]
        if len(missing_headings) == 0 and headings_overlap != headings:
            valid_order = False
            logging.error(
                "In {0}: "
                "Document headings do not match "
                "the order specified by the template".format(self.filename))

        return (len(missing_headings) == 0) and \
               valid_order and no_extra and correct_level

    def _validate_one_link(self, link_node):
        """Logic to validate a single external asset (image or link)

        Any local html file being linked was generated as part of the lesson.
        Therefore, file links (.html) must have a Markdown file
            in the expected folder.

        The title of the linked Markdown document should match the link text.

        For other assets (links or images), just verify that a file exists
        """
        dest, link_text = self.ast.get_link_info(link_node)

        if re.match(r"^[\w,\s-]+\.(html?)", dest, re.IGNORECASE):
            # HTML files in same folder are made from Markdown; special tests
            fn = dest.split("#")[0]  # Split anchor name from filename
            expected_md_fn = os.path.splitext(fn)[0] + os.extsep + "md"
            expected_md_path = os.path.join(self.markdown_dir,
                                            expected_md_fn)
            if not os.path.isfile(expected_md_path):
                logging.error(
                    "In {0}: "
                    "The document links to {1}, but could not find "
                    "the expected markdown file {2}".format(
                        self.filename, dest, expected_md_path))
                return False

            # If file exists, parse and validate link text = node title
            with open(expected_md_path, 'rU') as link_dest_file:
                dest_contents = link_dest_file.read()

            dest_ast = self._parse_markdown(dest_contents)
            dest_ast = vh.CommonMarkHelper(dest_ast)
            dest_page_title = dest_ast.get_doc_header_subtitle()

            if dest_page_title != link_text:
                logging.error(
                    "In {0}: "
                    "The linked page {1} exists, but "
                    "the link text '{2}' does not match the "
                    "(sub)title of that page, '{3}'.".format(
                        self.filename, dest,
                        link_text, dest_page_title))
                return False
        elif not re.match(r"^((https?|ftp)://)", dest, re.IGNORECASE)\
                and not re.match(r"^#.*", dest):
            # If not web URL, and not anchor on same page, then
            #  verify that local file exists
            dest_path = os.path.join(self.lesson_dir, dest)
            if not os.path.isfile(dest_path):
                logging.error(
                    "In {0}: "
                    "Could not find the linked asset file "
                    "{1} in {2}. If this is a URL, it must be "
                    "prefixed with http(s):// or ftp://.".format(
                        self.filename, dest, dest_path))
                return False
        else:
            logging.warning(
                "In {0}: "
                "Skipped validation of link {1}".format(self.filename, dest))
        return True

    def _validate_links(self, links_to_skip=()):
        """Validate all references to external content

        This includes links AND images: these are the two types of node that
        CommonMark assigns a .destination property"""
        links = self.ast.find_external_links()

        valid = True
        for link_node in links:
            if link_node.destination not in links_to_skip:
                res = self._validate_one_link(link_node)
                valid = valid and res
        return valid

    def _run_tests(self):
        """
        Let user override the list of tests to be performed.

        Error trapping is handled by the validate() wrapper method.
        """
        tests = [self._validate_doc_headers(),
                 self._validate_section_heading_order(),
                 self._validate_links()]

        return all(tests)

    def validate(self):
        """Perform all required validations. Wrap in exception handler"""
        try:
            return self._run_tests()
        except IndexError:
            logging.error("Document is missing critical sections")
            return False


class IndexPageValidator(MarkdownValidator):
    """Validate the contents of the homepage (index.md)"""
    HEADINGS = ['Topics',
                'Other Resources']

    DOC_HEADERS = {'layout': vh.is_str,
                   'title': vh.is_str}

    def _validate_intro_section(self):
        """Validate the intro section

        It must be a paragraph, followed by blockquoted list of prereqs"""
        intro_block = self.ast.children[3]
        intro_section = self.ast.is_paragraph(intro_block)
        if not intro_section:
            logging.error(
                "In {0}: "
                "Expected paragraph of introductory text at {1}".format(
                    self.filename, intro_block.start_line))

        # Validate the prerequisites block
        prereqs_block = self.ast.get_block_titled("Prerequisites",
                                                  heading_level=2)
        if prereqs_block:
            # Found the expected block; now check contents
            prereqs_tests = self.ast.has_number_children(prereqs_block[0],
                                                         minc=2)
        else:
            prereqs_tests = False

        if prereqs_tests is False:
            logging.error(
                "In {0}: "
                "Intro should contain a blockquoted section with level 2 "
                "title 'Prerequisites'. Section should not be empty.".format(
                    self.filename))
        return intro_section and prereqs_tests

    def _validate_links(self, links_to_skip=('motivation.html',
                                             'reference.html',
                                             'discussion.html',
                                             'instructors.html')):
        return super(IndexPageValidator, self)._validate_links(links_to_skip)

    def _run_tests(self):
        tests = [self._validate_intro_section()]
        parent_tests = super(IndexPageValidator, self)._run_tests()
        return all(tests) and parent_tests


class TopicPageValidator(MarkdownValidator):
    """Validate the Markdown contents of a topic page, eg 01-topicname.md"""
    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str,
                   "minutes": vh.is_numeric}

    # TODO: Write validator for, eg, challenge section
    def _validate_learning_objective(self):
        learn_node = self.ast.get_block_titled("Learning Objectives",
                                               heading_level=2)
        if learn_node:
            # In addition to title, the node must have some content
            node_tests = self.ast.has_number_children(learn_node[0], minc=2)
        else:
            node_tests = False

        if node_tests is False:
            logging.error(
                "In {0}: "
                "Learning Objectives should not be empty.".format(
                    self.filename))

        return node_tests

    def _validate_has_no_headings(self):
        """Check headings

        The top-level document has no headings indicating subtopics.
        The only valid subheadings are nested in blockquote elements"""
        heading_nodes = self.ast.get_section_headings()
        if len(heading_nodes) == 0:
            return True

        logging.error(
            "In {0}: "
            "The topic page should not have sub-headings "
            "outside of special blocks. "
            "If a topic needs sub-headings, "
            "it should be broken into multiple topics.".format(self.filename))
        for n in heading_nodes:
            logging.warning(
                "In {0}: "
                "The following sub-heading should be removed: {1}".format(
                    self.filename, n.strings[0]))
        return False

    def _run_tests(self):
        tests = [self._validate_has_no_headings(),
                 self._validate_learning_objective()]
        parent_tests = super(TopicPageValidator, self)._run_tests()
        return all(tests) and parent_tests


class MotivationPageValidator(MarkdownValidator):
    """Validate motivation.md"""
    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str}
    # TODO: How to validate? May be a mix of reveal.js (HTML) + markdown.


class ReferencePageValidator(MarkdownValidator):
    """Validate reference.md"""
    HEADINGS = ["Glossary"]
    WARN_ON_EXTRA_HEADINGS = False

    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str}

    def _validate_glossary_entry(self, glossary_entry):
        """Validate glossary entry

        Glossary entry must be formatted in conformance with Pandoc's
        ```definition_lists``` extension.

        That syntax isn't supported by the CommonMark parser, so we identify
        terms manually."""
        glossary_keyword = glossary_entry[0]
        if len(glossary_entry) < 2:
            logging.error(
                    "In {0}:"
                    "Glossary entry '{1}' must have at least two lines- "
                    "a term and a definition.".format(
                        glossary_keyword, self.filename))
            return False

        entry_is_valid = True
        for line_index, line in enumerate(glossary_entry):
            if line_index == 1:
                if not re.match("^:   ", line):
                    logging.error(
                            "In {0}:"
                            "At glossary entry '{1}' "
                            "First line of definition must "
                            "start with ':    '.".format(
                                glossary_keyword, self.filename))
                    entry_is_valid = False
            elif line_index > 1:
                if not re.match("^    ", line):
                    logging.error(
                            "In {0}:"
                            "At glossary entry '{1}' "
                            "Subsequent lines of definition must "
                            "start with '     '.".format(
                                glossary_keyword, self.filename))
                    entry_is_valid = False
        return entry_is_valid

    def _validate_glossary(self):
        """Validate the glossary section.

        Assumes that the glossary is at the end of the file:
            everything after the header. (and there must be a glossary section)

        Verifies that the only things in the glossary are definition items.
        """
        is_glossary_valid = True
        in_glossary = False
        for node in self.ast.children:
            if in_glossary:
                is_glossary_valid = is_glossary_valid and \
                    self._validate_glossary_entry(node.strings)
            elif self.ast.is_heading(node) and "Glossary" in node.strings:
                in_glossary = True

        return is_glossary_valid

    def _run_tests(self):
        tests = [self._validate_glossary()]
        parent_tests = super(ReferencePageValidator, self)._run_tests()
        return all(tests) and parent_tests


class InstructorPageValidator(MarkdownValidator):
    """Simple validator for Instructor's Guide- instructors.md"""
    HEADINGS = ["Legend", "Overall"]
    WARN_ON_EXTRA_HEADINGS = False

    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str}


class LicensePageValidator(MarkdownValidator):
    """Validate LICENSE.md: user should not edit this file"""
    def _run_tests(self):
        """Skip the base tests; just check md5 hash"""
        # TODO: This hash is specific to the license for english-language repo
        expected_hash = 'cd5742b6596a1f2f35c602ad43fa24b2'
        m = hashlib.md5()
        try:
            m.update(self.markdown)
        except TypeError:
            # Workaround for hashing in python3
            m.update(self.markdown.encode('utf-8'))

        if m.hexdigest() == expected_hash:
            return True
        else:
            logging.error("The provided license file should not be modified.")
            return False


class DiscussionPageValidator(MarkdownValidator):
    """
    Validate the discussion page (discussion.md).
    Most of the content is free-form.
    """
    WARN_ON_EXTRA_HEADINGS = False
    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str}


# Associate lesson template names with validators. This list used by CLI.
#   Dict of {name: (Validator, filename_pattern)}
LESSON_TEMPLATES = {"index": (IndexPageValidator, "^index"),
                    "topic": (TopicPageValidator, "^[0-9]{2}-.*"),
                    "motivation": (MotivationPageValidator, "^motivation"),
                    "reference": (ReferencePageValidator, "^reference"),
                    "instructor": (InstructorPageValidator, "^instructors"),
                    "license": (LicensePageValidator, "^LICENSE"),
                    "discussion": (DiscussionPageValidator, "^discussion")}

# List of files in the lesson directory that should not be validated at all
SKIP_FILES = ("DESIGN.md", "FAQ.md", "LAYOUT.md", "README.md")


def identify_template(filepath):
    """Identify template

    Given the path to a single file,
    identify the appropriate template to use"""
    for template_name, (validator, pattern) in LESSON_TEMPLATES.items():
        if re.search(pattern, os.path.basename(filepath)):
            return template_name

    return None


def validate_single(filepath, template=None):
    """Validate a single Markdown file based on a specified template"""
    if os.path.basename(filepath) in SKIP_FILES:
        # Silently pass certain non-lesson files without validating them
        return True

    template = template or identify_template(filepath)
    if template is None:
        logging.error(
            "Validation failed for {0}: "
            "Could not automatically identify correct template.".format(
                filepath))
        return False

    logging.debug(
        "Beginning validation of {0} using template {1}".format(
            filepath, template))
    validator = LESSON_TEMPLATES[template][0]
    validate_file = validator(filepath)

    res = validate_file.validate()
    if res is True:
        logging.debug("File {0} successfully passed validation".format(
            filepath))
    else:
        logging.debug("File {0} failed validation: "
                      "see error log for details".format(filepath))

    return res


def validate_folder(path, template=None):
    """Validate an entire folder of files"""
    search_str = os.path.join(path, "*.md")  # Find files based on extension
    filename_list = glob.glob(search_str)

    if not filename_list:
        logging.error(
            "No Markdown files were found "
            "in specified directory {0}".format(path))
        return False

    all_valid = True
    for fn in filename_list:
        res = validate_single(fn, template=template)
        all_valid = all_valid and res
    return all_valid


def start_logging(level=logging.INFO):
    """Initialize logging and print messages to console."""
    logging.basicConfig(stream=sys.stdout, level=level)


def command_line():
    """Handle arguments passed in via the command line"""
    parser = argparse.ArgumentParser()
    parser.add_argument("file_or_path",
                        nargs="*",
                        default=[os.getcwd()],
                        help="The individual pathname")

    parser.add_argument('-t', '--template',
                        choices=LESSON_TEMPLATES.keys(),
                        help="The type of template to apply to all file(s). "
                             "If not specified, will auto-identify template.")

    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help="Enable debug information.")

    return parser.parse_args()


def main(parsed_args_obj):
    if parsed_args_obj.debug:
        log_level = "DEBUG"
    else:
        log_level = "WARNING"
    start_logging(log_level)

    template = parsed_args_obj.template

    all_valid = True
    for fn in parsed_args_obj.file_or_path:
        if os.path.isdir(fn):
            res = validate_folder(fn, template=template)
        elif os.path.isfile(fn):
            res = validate_single(fn, template=template)
        else:
            res = False
            logging.error(
                "The specified file or folder {0} does not exist; "
                "could not perform validation".format(fn))

        all_valid = all_valid and res

    if all_valid is True:
        logging.debug("All Markdown files successfully passed validation.")
        sys.exit(0)
    else:
        logging.warning(
            "Some errors were encountered during validation. "
            "See log for details.")
        sys.exit(1)


if __name__ == "__main__":
    parsed_args = command_line()
    main(parsed_args)

    #### Sample of how validator is used directly
    # validator = HomePageValidator('../index.md')
    # print validator.validate()