From 0d5c61e0ff0db9caed278f2378b38ebb8f37c859 Mon Sep 17 00:00:00 2001
From: Andy Boughton <abought@gmail.com>
Date: Sun, 9 Nov 2014 21:54:01 -0500
Subject: [PATCH] Initial commit of template validator skeleton (from local
 repo)

Add validation methods and refactor ast helpers to separate class

Add tiny logging message; prep for first push to github fork

Start adding tests

Fix errors revealed by unit tests

Update validator to check heading order; tests passing

Validation of links and changes to heading validation

Tiny comment cleanup; push to share version with swc

Add some descriptive comments and create minimal validator example (for instructors.md) to demonstrate subclassing

Split helpers to separate file and update for newest index.md template

Grooming

Bring up to spec with newest versions of lesson templates. Add tests to ensure all templates pass validation.

Saner default assumption about markdown file locations

Add skeletons for remaining validators. Incorporate ranier's code for batch validation.

Code changes for single-file python 2 and 3 compatibility.

Refactor out template validation logic in prep for cmd line rework

Proof of concept refactor to command line sig for @rgaia-cs (issue #34)

With no path provided, default to curdir, not a hard-coded default

Implement license and discussion validators. Some test cleanup.

Validate file links against page titles. Add line length check.

Fix python3 hash error

Rename validator to check

Conflicts:
	tools/validate_markdown_template.py

Enforce section heading levels + pep8 cleanup

Improved link validation for non-html files

Validate images as well as links

Verify that all headings are exactly level 2

Link text validation based on document header "subtitle" instead of "title".

Adjustments based on run against the SQL repo.

Fix links to topics

By default only show errors and warnings to user

- Add another argument for CLI to enable debug log level
- Change some log messages to debug instead of info
- Add file information for errors and warnings

Fix import at check

From [PEP8](https://www.python.org/dev/peps/pep-0008/#imports)

> Imports should usually be on separate lines, e.g.:
>
> Yes: import os
>      import sys
>
> No:  import sys, os

Heading bugfixes, update failing tests, and cleanup. Fix swcarpentry/lesson-template#39, swcarpentry/lesson-template#42.

Add validation of glossary

Slight clarification to log messages.

Skip validation of some links in index

Fix super() call for python2; avoid mutable default argument.

Lesson template validator.
---
 pages/index.md              |   4 +-
 tools/check                 | 813 ++++++++++++++++++++++++++++--------
 tools/test_check.py         | 366 ++++++++++++++++
 tools/validation_helpers.py | 202 +++++++++
 4 files changed, 1198 insertions(+), 187 deletions(-)
 create mode 100644 tools/test_check.py
 create mode 100644 tools/validation_helpers.py

diff --git a/pages/index.md b/pages/index.md
index 55a249a..f113f1a 100644
--- a/pages/index.md
+++ b/pages/index.md
@@ -11,8 +11,8 @@ Paragraph of introductory material.
 
 ## Topics
 
-1.  [Topic Title 1](01-one.html)
-2.  [Topic Title 2](02-two.html)
+1.  [Topic Title One](01-one.html)
+2.  [Topic Title Two](02-two.html)
 
 ## Other Resources
 
diff --git a/tools/check b/tools/check
index 5457d73..12a402c 100755
--- a/tools/check
+++ b/tools/check
@@ -1,213 +1,656 @@
-#!/usr/bin/python
-#
-# Software Carpentry Lesson Validator
-#
-# Check for errors in lessons built using the Software Carpentry template
-# found at http://github.com/swcarpentry/lesson-template.
-#
-# Usage:
-#
-#     $ tools/check
+#! /usr/bin/env python
 
-import sys
+"""
+Validate Software Carpentry lessons
+according to the Markdown template specification described here:
+http://software-carpentry.org/blog/2014/10/new-lesson-template-v2.html
+
+Validates the presence of headings, as well as specific sub-nodes.
+Contains validators for several kinds of template.
+
+Call at command line with flag -h to see options and usage instructions.
+"""
+from __future__ import print_function
+
+import argparse
+import glob
+import hashlib
+import logging
 import os
 import re
-import yaml
+import sys
 
-#----------------------------------------
-# Error reporting.
+try:
+    # Code tested with CommonMark version 0.5.4; API may change
+    import CommonMark
+except ImportError:
+    ERROR_MESSAGE = """This program requires the CommonMark python package.
+Install using
 
-def report_error(file_path, line_number, line, error_message):
-    """
-    Print information about general error.
-    """
-    ERR_MSG = "Error at line {} of {}:\n\t{}\n{}"
-    print(ERR_MSG.format(line_number, file_path, line, error_message))
+    # pip install commonmark
 
-def report_missing(present, file_path, missing_element):
-    """
-    Print information about missing element.
-    """
-    ERR_MSG = "Error on {}: missing {}"
-    if not present:
-        print(ERR_MSG.format(file_path, missing_element))
+or
 
-def report_missing_metadata(missing_element):
-    """
-    Print information about missing metadata at YAML header.
-    """
-    ERR_MSG = "Error on YAML header: missing {}"
-    print(ERR_MSG.format(missing_element))
+    # easy_install commonmark
+"""
+    print(ERROR_MESSAGE)
+    sys.exit(1)
 
-def report_broken_link(file_path, line_number, link):
-    """
-    Print information about broken link.
-    """
-    ERR_MSG = "Broken link at line {} of {}:\n\tCan't find {}."
-    print(ERR_MSG.format(line_number, file_path, link))
+import validation_helpers as vh
 
-#----------------------------------------
-# Checking.
 
-def check_yaml(metadata):
-    """
-    Check if all metadata are present at YAML header.
-    """
-    METADATA_REQUIRED = {"layout", "title", "minutes"}
-    for key in METADATA_REQUIRED - set(metadata.keys()):
-        report_missing_metadata(key)
+class MarkdownValidator(object):
+    """Base class for Markdown validation
 
-# TODO: Implement check_lesson
-def check_lesson(file_path):
+    Contains basic validation skeleton to be extended for specific page types
     """
-    Checks the file ``pages/[0-9]{2}-.*.md`` for:
-
-    - "layout: topic" in YAML header
-    - "title" as keyword in YAML header
-    - line "> ## Learning Objectives {.objectives}" after YAML header
-    - items in learning objectives begin with "*"
-    - items in learning objective following four-space indentation rule
-    - code samples be of type input, error, output, python, shell, r, matlab, or sql
-    - callout box style
-    - challenge box style
-    """
-    pass
+    HEADINGS = []  # List of strings containing expected heading text
+    WARN_ON_EXTRA_HEADINGS = True  # Warn when other headings are present?
 
-# TODO: Implement check_discussion
-def check_discussion(file_path):
-    """
-    Checks the file ``pages/discussion.md`` for:
+    DOC_HEADERS = {}  # Rows in header section (first few lines of document).
 
-    FIXME: tell what need to check.
-    """
-    pass
+    def __init__(self, filename=None, markdown=None):
+        """Perform validation on a Markdown document.
 
-# TODO: Complete implementation of check_index
-# TODO: break check_index into pieces -- it's too long.
-def check_index(file_path):
-    """
-    Checks the file ``pages/index.md`` for:
-
-    - "layout: lesson" in YAML header
-    - "title" as keyword in YAML header
-    - introductory paragraph(s) right after YAML header
-    - line with "> ## Prerequisites"
-    - non-empty prerequisites
-    - title line with "## Topics"
-    - items at topic list begin with "*"
-    - items in topic list follow four-space indentation rule
-    - links at topic list are valid
-    - line with "## Other Resources"
-    - items at other resources list begin with "*"
-    - link at other resources list are valid
-    """
-    # State variables
-    in_yaml = False
-    yaml_metadata = []
-    has_prerequisites = False
-    has_topics = False
-    has_other_resources = False
-
-    # Load file and process it
-    with open(file_path, "r") as lines:
-        for line_number, line in enumerate(lines):
-            if re.match("---", line): # what if there are multiple YAML blocks??
-                in_yaml = not in_yaml
-            elif in_yaml:
-                yaml_metadata.append(line)
-            elif re.match("> ## Prerequisites", line): # check this in the Markdown or in the generated HTML?
-                has_prerequisites = True
-            elif re.match("## Topics", line): # as above?
-                has_topics = True
-            elif re.match("## Other Resources", line): # as above
-                has_other_resources = True
-            else:
-                ## Push this check into another function - this one is getting too long.
-                # Check if local links are valid
-                matches = re.search("\[.*\]\((?P<link>.*)\)", line)
-                if matches and not matches.group("link").startswith("http"):
-                    link = os.path.join(os.path.dirname(file_path), matches.group("link"))
-                    if link.endswith(".html"):
-                        link = link.replace("html", "md") # NO: what about "03-html-editing.html" ?
-                    if not os.path.exists(link):
-                        report_broken_link(file_path, line_number, link)
-
-    ## Again, this function is too long - break it into sub-functions.
-    # Check YAML
-    yaml_metadata = yaml.load("\n".join(yaml_metadata))
-    check_yaml(yaml_metadata)
-
-    # Check sections
-    ## Note the refactoring: replaces three conditionals with one.
-    report_missing(has_prerequisites, file_path, "Prerequisites")
-    report_missing(has_topics, file_path, "Topics")
-    report_missing(has_other_resources, file_path, "Other Resources")
-
-# TODO Implement check_intructors
-def check_intructors(file_path):
-    """
-    Checks the file ``pages/instructors.md`` for:
+        Validator accepts either the path to a file containing Markdown,
+        OR a valid Markdown string. The latter is useful for unit testing."""
+        self.filename = filename
 
-    - "title: Instructor"s Guide" in YAML header
-    - line with "## Overall"
-    - line with "## General Points"
-    - lines with topics titles begin with "## "
-    - points begin with "*" and following four space rules.
-    """
-    pass
+        if filename:
+            # Expect Markdown files to be in same directory as the input file
+            self.markdown_dir = os.path.dirname(filename)
+            self.lesson_dir = os.path.abspath(  # Parent directory of lesson
+                os.path.join(self.markdown_dir, os.pardir))
+            with open(filename, 'rU') as f:
+                self.markdown = f.read()
+        else:
+            # Look for linked content in ../pages (relative to this file)
+            self.lesson_dir = os.path.abspath(
+                os.path.join(os.path.dirname(__file__), os.pardir))
 
-# TODO Implement check_motivation
-def check_motivation(file_path):
-    """
-    Checks the file ``pages/motivation.md``.
+            self.markdown_dir = os.path.join(self.lesson_dir, "pages")
+            self.markdown = markdown
 
-    FIXME: tell what need to check.
-    """
-    pass
+        ast = self._parse_markdown(self.markdown)
+        self.ast = vh.CommonMarkHelper(ast)
 
-# TODO Implement check_reference
-def check_reference(file_path):
-    """
-    Checks the file ``pages/reference.md`` for:
+    def _parse_markdown(self, markdown):
+        parser = CommonMark.DocParser()
+        ast = parser.parse(markdown)
+        return ast
 
-    -   ``layout: reference`` in YAML header
-    -   line with "## Glossary"
-    -   words definitions after at the "Glossary" as::
+    def _validate_hrs(self):
+        """Validate header
 
-        > **Key Word 1**: the definition
-        > relevant to the lesson.
-    """
-    pass
+        Verify that the header section at top of document
+        is bracketed by two horizontal rules"""
+        valid = True
+        try:
+            hr_nodes = [self.ast.children[0], self.ast.children[2]]
+        except IndexError:
+            logging.error(
+                "In {0}: "
+                "Document must include header sections".format(self.filename))
+            return False
 
-def check_file(file_path):
-    """
-    Call the correctly check function based on the name of the file.
+        for hr in hr_nodes:
+            if not self.ast.is_hr(hr):
+                logging.error(
+                    "In {0}: "
+                    "Expected --- at line: {1}".format(
+                        self.filename, hr.start_line))
+                valid = False
+        return valid
+
+    def _validate_one_doc_header_row(self, text):
+        """Validate a single row of the document header section"""
+        label, content = text.split(":", 1)
+        if label not in self.DOC_HEADERS:
+            logging.warning(
+                "In {0}: "
+                "Unrecognized label in header section: {1}".format(
+                    self.filename, label))
+            return False
+
+        validation_function = self.DOC_HEADERS[label]
+        validate_header = validation_function(content)
+        if not validate_header:
+            logging.error(
+                "In {0}: "
+                "Document header field for label {1} "
+                "does not follow expected format".format(self.filename, label))
+        return validate_header
+
+    # Methods related to specific validation. Can override specific tests.
+    def _validate_doc_headers(self):
+        """Validate the document header section.
+
+        Pass only if the header of the document contains the specified
+            sections with the expected contents"""
+
+        # Header section should be wrapped in hrs
+        has_hrs = self._validate_hrs()
+
+        # Labeled sections in the actual headers should match expected format
+        header_node = self.ast.children[1]
+        test_headers = [self._validate_one_doc_header_row(s)
+                        for s in header_node.strings]
+
+        # Must have all expected header lines, and no others.
+        only_headers = (len(header_node.strings) == len(self.DOC_HEADERS))
+
+        # Headings must appear in the order expected
+        valid_order = self._validate_section_heading_order()
+
+        return has_hrs and all(test_headers) and only_headers and valid_order
+
+    def _validate_section_heading_order(self, ast_node=None, headings=None):
+        """Verify that section headings appear, and in the order expected"""
+        # TODO: Refactor into individual tests in the future
+        if ast_node is None:
+            ast_node = self.ast.data
+            headings = self.HEADINGS
+
+        heading_nodes = self.ast.get_section_headings(ast_node)
+        # All headings should be exactly level 2
+        correct_level = True
+        for n in heading_nodes:
+            if n.level != 2:
+                logging.error(
+                    "In {0}: "
+                    "Heading at line {1} should be level 2".format(
+                        self.filename, n.start_line))
+                correct_level = False
+
+        heading_labels = [vh.strip_attrs(n.strings[0]) for n in heading_nodes]
+
+        # Check for missing and extra headings
+        missing_headings = [expected_heading for expected_heading in headings
+                            if expected_heading not in heading_labels]
+
+        extra_headings = [found_heading for found_heading in heading_labels
+                          if found_heading not in headings]
+
+        for h in missing_headings:
+            logging.error(
+                "In {0}: "
+                "Document is missing expected heading: {1}".format(
+                    self.filename, h))
+
+        if self.WARN_ON_EXTRA_HEADINGS is True:
+            for h in extra_headings:
+                logging.error(
+                    "In {0}: "
+                    "Document contains heading "
+                    "not specified in the template: {1}".format(
+                        self.filename, h))
+            no_extra = (len(extra_headings) == 0)
+        else:
+            no_extra = True
+
+        # Check that the subset of headings
+        # in the template spec matches order in the document
+        valid_order = True
+        headings_overlap = [h for h in heading_labels if h in headings]
+        if len(missing_headings) == 0 and headings_overlap != headings:
+            valid_order = False
+            logging.error(
+                "In {0}: "
+                "Document headings do not match "
+                "the order specified by the template".format(self.filename))
+
+        return (len(missing_headings) == 0) and \
+               valid_order and no_extra and correct_level
+
+    def _validate_one_link(self, link_node):
+        """Logic to validate a single external asset (image or link)
+
+        Any local html file being linked was generated as part of the lesson.
+        Therefore, file links (.html) must have a Markdown file
+            in the expected folder.
+
+        The title of the linked Markdown document should match the link text.
+
+        For other assets (links or images), just verify that a file exists
+        """
+        dest, link_text = self.ast.get_link_info(link_node)
+
+        if re.match(r"^[\w,\s-]+\.(html?)", dest, re.IGNORECASE):
+            # HTML files in same folder are made from Markdown; special tests
+            expected_md_fn = os.path.splitext(dest)[0] + os.extsep + "md"
+            expected_md_path = os.path.join(self.markdown_dir,
+                                            expected_md_fn)
+            if not os.path.isfile(expected_md_path):
+                logging.error(
+                    "In {0}: "
+                    "The document links to {1}, but could not find "
+                    "the expected markdown file {2}".format(
+                        self.filename, dest, expected_md_path))
+                return False
+
+            # If file exists, parse and validate link text = node title
+            with open(expected_md_path, 'rU') as link_dest_file:
+                dest_contents = link_dest_file.read()
+
+            dest_ast = self._parse_markdown(dest_contents)
+            dest_ast = vh.CommonMarkHelper(dest_ast)
+            dest_page_title = dest_ast.get_doc_header_subtitle()
+
+            if dest_page_title != link_text:
+                logging.error(
+                    "In {0}: "
+                    "The linked page {1} exists, but "
+                    "the link text '{2}' does not match the "
+                    "(sub)title of that page, '{3}'.".format(
+                        self.filename, dest,
+                        link_text, dest_page_title))
+                return False
+        elif not re.match(r"^((https?|ftp)://)", dest, re.IGNORECASE)\
+                and not re.match(r"^#.*", dest):
+            # If not web URL, and not anchor on same page, then
+            #  verify that local file exists
+            dest_path = os.path.join(self.lesson_dir, dest)
+            if not os.path.isfile(dest_path):
+                logging.error(
+                    "In {0}: "
+                    "Could not find the linked asset file "
+                    "{1} in {2}. If this is a URL, it must be "
+                    "prefixed with http(s):// or ftp://.".format(
+                        self.filename, dest, dest_path))
+                return False
+        else:
+            logging.warning(
+                "In {0}: "
+                "Skipped validation of link {1}".format(self.filename, dest))
+        return True
+
+    def _validate_links(self, links_to_skip=()):
+        """Validate all references to external content
+
+        This includes links AND images: these are the two types of node that
+        CommonMark assigns a .destination property"""
+        links = self.ast.find_external_links()
+
+        valid = True
+        for link_node in links:
+            if link_node.destination not in links_to_skip:
+                res = self._validate_one_link(link_node)
+                valid = valid and res
+        return valid
+
+    def _run_tests(self):
+        """
+        Let user override the list of tests to be performed.
+
+        Error trapping is handled by the validate() wrapper method.
+        """
+        tests = [self._validate_doc_headers(),
+                 self._validate_section_heading_order(),
+                 self._validate_links()]
+
+        return all(tests)
+
+    def validate(self):
+        """Perform all required validations. Wrap in exception handler"""
+        try:
+            return self._run_tests()
+        except IndexError:
+            logging.error("Document is missing critical sections")
+            return False
+
+
+class IndexPageValidator(MarkdownValidator):
+    """Validate the contents of the homepage (index.md)"""
+    HEADINGS = ['Topics',
+                'Other Resources']
+
+    DOC_HEADERS = {'layout': vh.is_str,
+                   'title': vh.is_str}
+
+    def _validate_intro_section(self):
+        """Validate the intro section
+
+        It must be a paragraph, followed by blockquoted list of prereqs"""
+        intro_block = self.ast.children[3]
+        intro_section = self.ast.is_paragraph(intro_block)
+        if not intro_section:
+            logging.error(
+                "In {0}: "
+                "Expected paragraph of introductory text at {1}".format(
+                    self.filename, intro_block.start_line))
+
+        # Validate the prerequisites block
+        prereqs_block = self.ast.get_block_titled("Prerequisites",
+                                                  heading_level=2)
+        if prereqs_block:
+            # Found the expected block; now check contents
+            prereqs_tests = self.ast.has_number_children(prereqs_block[0],
+                                                         minc=2)
+        else:
+            prereqs_tests = False
+
+        if prereqs_tests is False:
+            logging.error(
+                "In {0}: "
+                "Intro should contain a blockquoted section with level 2 "
+                "title 'Prerequisites'. Section should not be empty.".format(
+                    self.filename))
+        return intro_section and prereqs_tests
+
+    def _validate_links(self, links_to_skip=('motivation.html',
+                                             'reference.html',
+                                             'discussion.html',
+                                             'instructors.html')):
+        return super(IndexPageValidator, self)._validate_links(links_to_skip)
+
+    def _run_tests(self):
+        tests = [self._validate_intro_section()]
+        parent_tests = super(IndexPageValidator, self)._run_tests()
+        return all(tests) and parent_tests
+
+
+class TopicPageValidator(MarkdownValidator):
+    """Validate the Markdown contents of a topic page, eg 01-topicname.md"""
+    DOC_HEADERS = {"layout": vh.is_str,
+                   "title": vh.is_str,
+                   "subtitle": vh.is_str,
+                   "minutes": vh.is_numeric}
+
+    # TODO: Write validator for, eg, challenge section
+    def _validate_learning_objective(self):
+        learn_node = self.ast.get_block_titled("Learning Objectives",
+                                               heading_level=2)
+        if learn_node:
+            # In addition to title, the node must have some content
+            node_tests = self.ast.has_number_children(learn_node[0], minc=2)
+        else:
+            node_tests = False
+
+        if node_tests is False:
+            logging.error(
+                "In {0}: "
+                "Learning Objectives should not be empty.".format(
+                    self.filename))
+
+        return node_tests
+
+    def _validate_has_no_headings(self):
+        """Check headings
+
+        The top-level document has no headings indicating subtopics.
+        The only valid subheadings are nested in blockquote elements"""
+        heading_nodes = self.ast.get_section_headings()
+        if len(heading_nodes) == 0:
+            return True
+
+        logging.error(
+            "In {0}: "
+            "The topic page should not have sub-headings "
+            "outside of special blocks. "
+            "If a topic needs sub-headings, "
+            "it should be broken into multiple topics.".format(self.filename))
+        for n in heading_nodes:
+            logging.warning(
+                "In {0}: "
+                "The following sub-heading should be removed: {1}".format(
+                    self.filename, n.strings[0]))
+        return False
+
+    def _run_tests(self):
+        tests = [self._validate_has_no_headings(),
+                 self._validate_learning_objective()]
+        parent_tests = super(TopicPageValidator, self)._run_tests()
+        return all(tests) and parent_tests
+
+
+class MotivationPageValidator(MarkdownValidator):
+    """Validate motivation.md"""
+    DOC_HEADERS = {"layout": vh.is_str,
+                   "title": vh.is_str}
+    # TODO: How to validate? May be a mix of reveal.js (HTML) + markdown.
+
+
+class ReferencePageValidator(MarkdownValidator):
+    """Validate reference.md"""
+    HEADINGS = ["Glossary"]
+    WARN_ON_EXTRA_HEADINGS = False
+
+    DOC_HEADERS = {"layout": vh.is_str,
+                   "title": vh.is_str,
+                   "subtitle": vh.is_str}
+
+    def _validate_glossary_entry(self, glossary_entry):
+        """Validate glossary entry
+
+        Glossary entry must be formatted in conformance with Pandoc's
+        ```definition_lists``` extension.
+
+        That syntax isn't supported by the CommonMark parser, so we identify
+         terms manually."""
+        if len(glossary_entry) < 2:
+            logging.error(
+                    "In {0}:"
+                    "Glossary entry must have at least two lines- "
+                    "a term and a definition.".format(
+                        self.filename))
+            return False
+
+        entry_is_valid = True
+        for line_index, line in enumerate(glossary_entry):
+            if line_index == 1:
+                if not re.match("^:   ", line):
+                    logging.error(
+                            "In {0}:"
+                            "First line of definition must "
+                            "start with ':    '.".format(
+                                self.filename))
+                    entry_is_valid = False
+            elif line_index > 1:
+                if not re.match("^    ", line):
+                    logging.error(
+                            "In {0}:"
+                            "Subsequent lines of definition must "
+                            "start with '     '.".format(
+                                self.filename))
+                    entry_is_valid = False
+        return entry_is_valid
+
+    def _validate_glossary(self):
+        """Validate the glossary section.
+
+        Assumes that the glossary is at the end of the file:
+            everything after the header. (and there must be a glossary section)
+
+        Verifies that the only things in the glossary are definition items.
+        """
+        is_glossary_valid = True
+        in_glossary = False
+        for node in self.ast.children:
+            if in_glossary:
+                is_glossary_valid = is_glossary_valid and \
+                    self._validate_glossary_entry(node.strings)
+            elif self.ast.is_heading(node) and "Glossary" in node.strings:
+                in_glossary = True
+
+        return is_glossary_valid
+
+    def _run_tests(self):
+        tests = [self._validate_glossary()]
+        parent_tests = super(ReferencePageValidator, self)._run_tests()
+        return all(tests) and parent_tests
+
+
+class InstructorPageValidator(MarkdownValidator):
+    """Simple validator for Instructor's Guide- instructors.md"""
+    HEADINGS = ["Legend", "Overall"]
+    WARN_ON_EXTRA_HEADINGS = False
+
+    DOC_HEADERS = {"layout": vh.is_str,
+                   "title": vh.is_str,
+                   "subtitle": vh.is_str}
+
+
+class LicensePageValidator(MarkdownValidator):
+    """Validate LICENSE.md: user should not edit this file"""
+    def _run_tests(self):
+        """Skip the base tests; just check md5 hash"""
+        # TODO: This hash is specific to the license for english-language repo
+        expected_hash = '258aa6822fa77f7c49c37c3759017891'
+        m = hashlib.md5()
+        try:
+            m.update(self.markdown)
+        except TypeError:
+            # Workaround for hashing in python3
+            m.update(self.markdown.encode('utf-8'))
+
+        if m.hexdigest() == expected_hash:
+            return True
+        else:
+            logging.error("The provided license file should not be modified.")
+            return False
+
+
+class DiscussionPageValidator(MarkdownValidator):
     """
-    # Pair of regex and function to call
-    CONTROL = (
-        ("[0-9]{2}-.*", check_lesson),
-        ("discussion",  check_discussion),
-        ("index",       check_index),
-        ("instructors", check_intructors),
-        ("motivation",  check_motivation),
-        ("reference",   check_reference)
-    )
-    for (pattern, checker) in CONTROL:
-        if re.search(pattern, file_path):
-            checker(file_path)
-
-def main(list_of_files):
+    Validate the discussion page (discussion.md).
+    Most of the content is free-form.
     """
-    Call the check function for every file in ``list_of_files``.
+    WARN_ON_EXTRA_HEADINGS = False
+    DOC_HEADERS = {"layout": vh.is_str,
+                   "title": vh.is_str,
+                   "subtitle": vh.is_str}
 
-    If ``list_of_files`` is empty load all the files from ``pages`` directory.
-    """
-    if not list_of_files:
-        list_of_files = [os.path.join("pages", filename) for filename in os.listdir("pages")]
 
-    for filename in list_of_files:
-        if filename.endswith(".md"):
-            check_file(filename)
+# Associate lesson template names with validators. This list used by CLI.
+#   Dict of {name: (Validator, filename_pattern)}
+LESSON_TEMPLATES = {"index": (IndexPageValidator, "^index"),
+                    "topic": (TopicPageValidator, "^[0-9]{2}-.*"),
+                    "motivation": (MotivationPageValidator, "^motivation"),
+                    "reference": (ReferencePageValidator, "^reference"),
+                    "instructor": (InstructorPageValidator, "^instructors"),
+                    "license": (LicensePageValidator, "^LICENSE"),
+                    "discussion": (DiscussionPageValidator, "^discussion")}
+
+
+def identify_template(filepath):
+    """Identify template
+
+    Given the path to a single file,
+    identify the appropriate template to use"""
+    for template_name, (validator, pattern) in LESSON_TEMPLATES.items():
+        if re.search(pattern, os.path.basename(filepath)):
+            return template_name
+
+    return None
+
+
+def validate_single(filepath, template=None):
+    """Validate a single Markdown file based on a specified template"""
+    template = template or identify_template(filepath)
+    if template is None:
+        logging.error(
+            "Validation failed for {0}: "
+            "Could not automatically identify correct template.".format(
+                filepath))
+        return False
+
+    logging.debug(
+        "Beginning validation of {0} using template {1}".format(
+            filepath, template))
+    validator = LESSON_TEMPLATES[template][0]
+    validate_file = validator(filepath)
+
+    res = validate_file.validate()
+    if res is True:
+        logging.debug("File {0} successfully passed validation".format(
+            filepath))
+    else:
+        logging.debug("File {0} failed validation: "
+                      "see error log for details".format(filepath))
+
+    return res
+
+
+def validate_folder(path, template=None):
+    """Validate an entire folder of files"""
+    search_str = os.path.join(path, "*.md")  # Find files based on extension
+    filename_list = glob.glob(search_str)
+
+    if not filename_list:
+        logging.error(
+            "No Markdown files were found "
+            "in specified directory {0}".format(path))
+        return False
+
+    all_valid = True
+    for fn in filename_list:
+        res = validate_single(fn, template=template)
+        all_valid = all_valid and res
+    return all_valid
+
+
+def start_logging(level=logging.INFO):
+    """Initialize logging and print messages to console."""
+    logging.basicConfig(stream=sys.stdout, level=level)
+
+
+def command_line():
+    """Handle arguments passed in via the command line"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("file_or_path",
+                        nargs="*",
+                        default=[os.getcwd()],
+                        help="The individual pathname")
+
+    parser.add_argument('-t', '--template',
+                        choices=LESSON_TEMPLATES.keys(),
+                        help="The type of template to apply to all file(s). "
+                             "If not specified, will auto-identify template.")
+
+    parser.add_argument('-d', '--debug',
+                        action='store_true',
+                        help="Enable debug information.")
+
+    return parser.parse_args()
+
+
+def main(parsed_args_obj):
+    if parsed_args_obj.debug:
+        log_level = "DEBUG"
+    else:
+        log_level = "WARNING"
+    start_logging(log_level)
+
+    template = parsed_args_obj.template
+
+    all_valid = True
+    for fn in parsed_args_obj.file_or_path:
+        if os.path.isdir(fn):
+            res = validate_folder(fn, template=template)
+        elif os.path.isfile(fn):
+            res = validate_single(fn, template=template)
+        else:
+            res = False
+            logging.error(
+                "The specified file or folder {0} does not exist; "
+                "could not perform validation".format(fn))
+
+        all_valid = all_valid and res
+
+    if all_valid is True:
+        logging.debug("All Markdown files successfully passed validation.")
+        sys.exit(0)
+    else:
+        logging.warning(
+            "Some errors were encountered during validation. "
+            "See log for details.")
+        sys.exit(1)
+
 
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    parsed_args = command_line()
+    main(parsed_args)
+
+    #### Sample of how validator is used directly
+    # validator = IndexPageValidator('../index.md')
+    # print(validator.validate())
diff --git a/tools/test_check.py b/tools/test_check.py
new file mode 100644
index 0000000..451cf78
--- /dev/null
+++ b/tools/test_check.py
@@ -0,0 +1,366 @@
+#! /usr/bin/env python
+
+import imp, logging, os, unittest
+check = imp.load_source("check",  # "check" has no .py suffix: load by path
+                        os.path.join(os.path.dirname(__file__), "check"))
+
+# Log at DEBUG level so messages are visible when auditing test failures
+check.start_logging(level=logging.DEBUG)
+
+
+class BaseTemplateTest(unittest.TestCase):
+    """Shared fixture logic for the per-template validator test cases"""
+    VALIDATOR = check.MarkdownValidator
+    SAMPLE_FILE = ""  # Path to a file that should pass all tests
+
+    def _create_validator(self, markdown):
+        """Build a validator from a markdown string (handy for failure cases)"""
+        return self.VALIDATOR(markdown=markdown)
+
+    def setUp(self):
+        self.sample_validator = self.VALIDATOR(self.SAMPLE_FILE)
+
+
+class TestAstHelpers(BaseTemplateTest):
+    VALIDATOR = check.MarkdownValidator
+    SAMPLE_FILE = '../pages/index.md'
+
+    def test_link_text_extracted(self):
+        """Link nodes expose both their destination and their visible text"""
+        helper = self._create_validator(
+            """[This is a link](discussion.html)""").ast
+        first_link = helper.find_external_links(helper.children[0])[0]
+        destination, text = helper.get_link_info(first_link)
+        self.assertEqual(destination, "discussion.html")
+        self.assertEqual(text, "This is a link")
+
+
+class TestIndexPage(BaseTemplateTest):
+    """Test the ability to correctly identify and validate specific sections
+        of a markdown file"""
+    SAMPLE_FILE = "../pages/index.md"
+    VALIDATOR = check.IndexPageValidator
+
+    def test_sample_file_passes_validation(self):
+        res = self.sample_validator.validate()
+        self.assertTrue(res)
+
+    # TESTS INVOLVING DOCUMENT HEADER SECTION
+    def test_headers_missing_hrs(self):
+        """Headers that are not enclosed by horizontal rules fail validation"""
+        validator = self._create_validator("""Blank row
+
+layout: lesson
+title: Lesson Title
+keywords: ["some", "key terms", "in a list"]
+
+Another section that isn't an HR
+""")
+
+        self.assertFalse(validator._validate_doc_headers())
+
+    def test_headers_missing_a_line(self):
+        """One of the required headers is missing"""
+        validator = self._create_validator("""---
+layout: lesson
+keywords: ["some", "key terms", "in a list"]
+---""")
+        self.assertFalse(validator._validate_doc_headers())
+
+    def test_headers_fail_with_other_content(self):
+        """Headers that carry an unexpected extra line fail validation"""
+        validator = self._create_validator("""---
+layout: lesson
+title: Lesson Title
+keywords: ["some", "key terms", "in a list"]
+otherline: Nothing
+---""")
+        self.assertFalse(validator._validate_doc_headers())
+
+    def test_headers_fail_because_invalid_content(self):
+        validator = self._create_validator("""---
+layout: lesson
+title: Lesson Title
+keywords: this is not a list
+---""")
+        self.assertFalse(validator._validate_doc_headers())
+
+    # TESTS INVOLVING SECTION TITLES/HEADINGS
+    def test_index_has_valid_section_headings(self):
+        """The provided index page passes the section heading order check"""
+        res = self.sample_validator._validate_section_heading_order()
+        self.assertTrue(res)
+
+    def test_index_fail_when_section_heading_absent(self):
+        res = self.sample_validator.ast.has_section_heading("Fake heading")
+        self.assertFalse(res)
+
+    def test_fail_when_section_heading_is_wrong_level(self):
+        """All headings must be exactly level 2"""
+        validator = self._create_validator("""---
+layout: page
+title: Lesson Title
+---
+Paragraph of introductory material.
+
+> ## Prerequisites
+>
+> A short paragraph describing what learners need to know
+> before tackling this lesson.
+
+### Topics
+
+1.  [Topic Title 1](01-one.html)
+2.  [Topic Title 2](02-two.html)
+
+## Other Resources
+
+*   [Motivation](motivation.html)
+*   [Reference Guide](reference.html)
+*   [Next Steps](discussion.html)
+*   [Instructor's Guide](instructors.html)""")
+        self.assertFalse(validator._validate_section_heading_order())
+
+    def test_fail_when_section_headings_in_wrong_order(self):
+        validator = self._create_validator("""---
+layout: lesson
+title: Lesson Title
+keywords: ["some", "key terms", "in a list"]
+---
+Paragraph of introductory material.
+
+> ## Prerequisites
+>
+> A short paragraph describing what learners need to know
+> before tackling this lesson.
+
+## Other Resources
+
+* [Motivation](motivation.html)
+* [Reference Guide](reference.html)
+* [Instructor's Guide](instructors.html)
+
+
+## Topics
+
+* [Topic Title 1](01-one.html)
+* [Topic Title 2](02-two.html)""")
+
+        self.assertFalse(validator._validate_section_heading_order())
+
+    def test_pass_when_prereq_section_has_correct_heading_level(self):
+        validator = self._create_validator("""---
+layout: lesson
+title: Lesson Title
+keywords: ["some", "key terms", "in a list"]
+---
+Paragraph of introductory material.
+
+> ## Prerequisites
+>
+> A short paragraph describing what learners need to know
+> before tackling this lesson.
+""")
+        self.assertTrue(validator._validate_intro_section())
+
+    def test_fail_when_prereq_section_has_incorrect_heading_level(self):
+        validator = self._create_validator("""---
+layout: lesson
+title: Lesson Title
+keywords: ["some", "key terms", "in a list"]
+---
+Paragraph of introductory material.
+
+> # Prerequisites
+>
+> A short paragraph describing what learners need to know
+> before tackling this lesson.
+""")
+        self.assertFalse(validator._validate_intro_section())
+
+    # TESTS INVOLVING LINKS TO OTHER CONTENT
+    def test_file_links_validate(self):
+        res = self.sample_validator._validate_links()
+        self.assertTrue(res)
+
+    def test_html_link_to_extant_md_file_passes(self):
+        """Verify that an HTML link with corresponding MD file will pass"""
+        validator = self._create_validator("""[Topic Title One](01-one.html)""")
+        self.assertTrue(validator._validate_links())
+
+    def test_html_link_with_anchor_to_extant_md_passes(self):
+        """Verify that link is identified correctly even if to page anchor
+
+        For now this just tests that the regex handles #anchors.
+         It doesn't validate that the named anchor exists in the md file
+        """
+        validator = self._create_validator("""[Topic Title One](01-one.html#anchor)""")
+        self.assertTrue(validator._validate_links())
+
+    def test_inpage_anchor_passes_validation(self):
+        """Links that reference anchors within the page should be ignored"""
+        # TODO: Revisit once anchor rules are available
+        validator = self._create_validator("""Most databases also support Booleans and date/time values;
+SQLite uses the integers 0 and 1 for the former, and represents the latter as discussed [earlier](#a:dates).""")
+        self.assertTrue(validator._validate_links())
+
+    def test_missing_markdown_file_fails_validation(self):
+        """Fail validation when an html file is linked without corresponding
+            markdown file"""
+        validator = self._create_validator("""[Broken link](nonexistent.html)""")
+        self.assertFalse(validator._validate_links())
+
+    def test_website_link_ignored_by_validator(self):
+        """Don't look for markdown if the file linked isn't local-
+            remote website links are ignored"""
+        validator = self._create_validator("""[Broken link](http://website.com/filename.html)""")
+        self.assertTrue(validator._validate_links())
+
+    def test_malformed_website_link_fails_validator(self):
+        """If the link isn't prefixed by http(s):// or ftp://, fail.
+         This is because there are a lot of edge cases in distinguishing
+            between filenames and URLs: err on the side of certainty."""
+        validator = self._create_validator("""[Broken link](www.website.com/filename.html)""")
+        self.assertFalse(validator._validate_links())
+
+    def test_finds_image_asset(self):
+        """Image asset is found"""
+        validator = self._create_validator(
+            """![this is the image's title](fig/example.svg "this is the image's alt text")""")
+        self.assertTrue(validator._validate_links())
+
+    def test_image_asset_not_found(self):
+        """Image asset can't be found if path is invalid"""
+        validator = self._create_validator(
+            """![this is the image's title](fig/exemple.svg "this is the image's alt text")""")
+        self.assertFalse(validator._validate_links())
+
+    def test_non_html_link_finds_csv(self):
+        """Look for CSV file in appropriate folder"""
+        validator = self._create_validator(
+            """Use [this CSV](data/data.csv) for the exercise.""")
+        self.assertTrue(validator._validate_links())
+
+    def test_non_html_links_are_path_sensitive(self):
+        """Fails to find CSV file with wrong path."""
+        validator = self._create_validator(
+            """Use [this CSV](data.csv) for the exercise.""")
+        self.assertFalse(validator._validate_links())
+
+
+class TestTopicPage(BaseTemplateTest):
+    """Checks the topic page validator against the sample topic page"""
+    VALIDATOR = check.TopicPageValidator
+    SAMPLE_FILE = "../pages/01-one.md"
+
+    def test_sample_file_passes_validation(self):
+        """The sample file is reported as valid"""
+        self.assertTrue(self.sample_validator.validate())
+
+
+class TestMotivationPage(BaseTemplateTest):
+    """Checks the motivation page validator against the sample page"""
+    VALIDATOR = check.MotivationPageValidator
+    SAMPLE_FILE = "../pages/motivation.md"
+
+    def test_sample_file_passes_validation(self):
+        """The sample file is reported as valid"""
+        self.assertTrue(self.sample_validator.validate())
+
+
+class TestReferencePage(BaseTemplateTest):
+    """Checks the reference page validator, glossary rules in particular"""
+    VALIDATOR = check.ReferencePageValidator
+    SAMPLE_FILE = "../pages/reference.md"
+
+    def test_missing_glossary_definition(self):
+        validator = self._create_validator("")
+        entry = ["Key word"]
+        self.assertFalse(validator._validate_glossary_entry(entry))
+
+    def test_missing_colon_at_glossary_definition(self):
+        validator = self._create_validator("")
+        entry = ["Key word", "Definition of term"]
+        self.assertFalse(validator._validate_glossary_entry(entry))
+
+    def test_wrong_indentation_at_glossary_definition(self):
+        validator = self._create_validator("")
+        entry = ["Key word", ": Definition of term"]
+        self.assertFalse(validator._validate_glossary_entry(entry))
+
+    def test_wrong_continuation_at_glossary_definition(self):
+        validator = self._create_validator("")
+        entry = ["Key word", ":   Definition of term", "continuation"]
+        self.assertFalse(validator._validate_glossary_entry(entry))
+
+    def test_valid_glossary_definition(self):
+        validator = self._create_validator("")
+        entry = ["Key word", ":   Definition of term", "    continuation"]
+        self.assertTrue(validator._validate_glossary_entry(entry))
+
+    def test_only_definitions_can_appear_after_glossary_heading(self):
+        glossary_ok = self._create_validator("""## Glossary
+
+Key Word 1
+:   Definition of first term
+
+Paragraph
+
+Key Word 2
+:   Definition of second term
+""")._validate_glossary()
+        self.assertFalse(glossary_ok)
+
+    def test_glossary(self):
+        glossary_ok = self._create_validator("""## Glossary
+
+Key Word 1
+:   Definition of first term
+
+Key Word 2
+:   Definition of second term
+""")._validate_glossary()
+        self.assertTrue(glossary_ok)
+
+    def test_sample_file_passes_validation(self):
+        """The sample file is reported as valid"""
+        self.assertTrue(self.sample_validator.validate())
+
+
+class TestInstructorPage(BaseTemplateTest):
+    """Checks the instructors page validator against the sample page"""
+    VALIDATOR = check.InstructorPageValidator
+    SAMPLE_FILE = "../pages/instructors.md"
+
+    def test_sample_file_passes_validation(self):
+        """The sample file is reported as valid"""
+        self.assertTrue(self.sample_validator.validate())
+
+
+class TestLicensePage(BaseTemplateTest):
+    """Verifies that the license page validator works as expected"""
+    SAMPLE_FILE = '../pages/LICENSE.md'
+    VALIDATOR = check.LicensePageValidator
+
+    def test_sample_file_passes_validation(self):
+        self.assertTrue(self.sample_validator.validate())
+
+    def test_modified_file_fails_validation(self):
+        """A license whose text has been altered must fail validation"""
+        # Plain 'r': the 'U' flag is deprecated and rejected by Python 3.11+
+        with open(self.SAMPLE_FILE, 'r') as f:
+            mod_text = f.read().replace("The", "the")
+        self.assertFalse(self._create_validator(mod_text).validate())
+
+
+class TestDiscussionPage(BaseTemplateTest):
+    """Checks the discussion page validator against the sample page"""
+    VALIDATOR = check.DiscussionPageValidator
+    SAMPLE_FILE = '../pages/discussion.md'
+
+    def test_sample_file_passes_validation(self):
+        self.assertTrue(self.sample_validator.validate())
+
+
+if __name__ == "__main__":
+    unittest.main()  # Run the tests when this file is executed as a script
diff --git a/tools/validation_helpers.py b/tools/validation_helpers.py
new file mode 100644
index 0000000..6acc11c
--- /dev/null
+++ b/tools/validation_helpers.py
@@ -0,0 +1,202 @@
+#! /usr/bin/env python
+
+import json
+import logging
+import re
+import sys
+
+try:  # Python 2 defines basestring; Python 3 does not, so alias it to str
+    basestring
+except NameError:
+    basestring = str
+
+
+# Common validation functions
+def is_list(text):
+    """Report whether the string, once parsed as JSON, yields a python list"""
+    candidate = text.strip()
+    try:
+        parsed = json.loads(candidate)
+    except ValueError:
+        message = "Could not convert string to python object: {0}"
+        logging.debug(message.format(candidate))
+        return False
+    return isinstance(parsed, list)
+
+
+def is_str(text):
+    """Check that the input is a python string and is not empty"""
+    return isinstance(text, basestring) and bool(text)
+
+
+def is_numeric(text):
+    """Whether the input represents a number (False for None, lists, etc.)"""
+    try:
+        float(text)
+    except (TypeError, ValueError):  # TypeError: non-string such as None
+        return False
+    return True
+
+
+#### Text cleanup functions, pre-validation
+def strip_attrs(s):
+    """Remove each {.name} attribute, and the whitespace character before it"""
+    return re.compile(r"\s\{\..*?\}").sub("", s)
+
+
+def get_css_class(s):
+    """Return any and all CSS classes (when a line is suffixed by {.classname})
+    Returns empty list when no {.classname} marker is present"""
+    return re.findall(r"\{\.(.*?)\}", s)
+
+
+### Helper objects
+class CommonMarkHelper(object):
+    """Basic helper functions for working with the internal abstract syntax
+    tree produced by CommonMark parser"""
+    def __init__(self, ast):
+        self.data = ast
+        self.children = self.data.children
+
+    def _get_doc_header_value(self, name):
+        """Return the stripped value of the document header line whose
+        (lower-cased) label equals `name`, or '' when there is none.
+
+        Header lines that contain no colon are skipped"""
+        doc_headers = self.data.children[1]  # Throw index error if none found
+
+        for s in doc_headers.strings:
+            label, colon, contents = s.partition(":")
+            if colon and label.lower() == name:
+                return contents.strip()
+
+        # If label not found, return an empty string for display purposes
+        return ''
+
+    def get_doc_header_title(self):
+        """Helper method for SWC templates: get the document title from
+        the YAML headers"""
+        return self._get_doc_header_value("title")
+
+    def get_doc_header_subtitle(self):
+        """Helper method for SWC templates: get the document subtitle from
+        the YAML headers"""
+        return self._get_doc_header_value("subtitle")
+
+    def get_block_titled(self, title, heading_level=2, ast_node=None):
+        """Examine children. Return all children of the given node that:
+        a) are blockquoted elements, and
+        b) contain a heading with the specified text, at the specified level.
+        For example, this can be used to find the "Prerequisites" section
+            in index.md
+
+        Returns empty list if no appropriate node is found"""
+        if ast_node is None:
+            ast_node = self.data
+        return [n for n in ast_node.children
+                if self.is_block(n) and
+                self.has_section_heading(
+                    title,
+                    ast_node=n,
+                    heading_level=heading_level,
+                    show_msg=False)]
+
+    def get_section_headings(self, ast_node=None):
+        """Returns a list of ast nodes that are headings"""
+        if ast_node is None:
+            ast_node = self.data
+        return [n for n in ast_node.children if self.is_heading(n)]
+
+    def get_link_info(self, link_node):
+        """Given a link or image node, return its (destination, text) tuple;
+        text is None when the label of the node cannot be read"""
+        if not self.is_external(link_node):
+            raise TypeError("Cannot apply this method to something that is not a link")
+
+        dest = link_node.destination
+        try:
+            link_text = link_node.label[0].c
+        except (AttributeError, IndexError, TypeError):
+            link_text = None
+
+        return dest, link_text
+
+    def find_external_links(self, ast_node=None):
+        """Recursive function that locates all references to external content
+         under specified node. (links or images)"""
+        if ast_node is None:
+            ast_node = self.data
+
+        # Link can be node itself, or hiding in inline content
+        links = [n for n in ast_node.inline_content if self.is_external(n)]
+
+        if self.is_external(ast_node):
+            links.append(ast_node)
+
+        # Also look for links in sub-nodes
+        for n in ast_node.children:
+            links.extend(self.find_external_links(n))
+
+        return links
+
+    def has_section_heading(self, section_title, ast_node=None,
+                            heading_level=2, limit=sys.maxsize, show_msg=True):
+        """Does the file contain (<= x copies of) specified heading text?
+        Will strip off any CSS attributes when looking for the section title"""
+        if ast_node is None:
+            ast_node = self.data
+
+        num_nodes = len([n for n in self.get_section_headings(ast_node)
+                         if (strip_attrs(n.strings[0]) == section_title)
+                         and (n.level == heading_level)])
+
+        # Suppress error msg if used as a helper method
+        if show_msg and num_nodes == 0:
+            logging.error("Document does not contain the specified "
+                          "heading: {0}".format(section_title))
+        elif show_msg and num_nodes > limit:
+            logging.error("Document must not contain more than {0} copies of"
+                          " the heading {1}".format(limit, section_title or 0))
+        elif show_msg:
+            logging.info("Verified that document contains the specified"
+                         " heading: {0}".format(section_title))
+        return (0 < num_nodes <= limit)
+
+    def has_number_children(self, ast_node,
+                            exact=None, minc=0, maxc=sys.maxsize):
+        """Does the specified node (such as a bulleted list) have the expected
+         number of children? `exact`, when given, overrides minc and maxc"""
+        # Compare against None so that exact=0 ("no children") is honoured
+        if exact is not None:
+            minc = maxc = exact
+
+        return (minc <= len(ast_node.children) <= maxc)
+
+    # Helpers, in case the evolving CommonMark spec changes the names of nodes
+    def is_hr(self, ast_node):
+        """Is the node a horizontal rule (hr)?"""
+        return ast_node.t == 'HorizontalRule'
+
+    def is_heading(self, ast_node):
+        """Is the node a heading/ title?"""
+        return ast_node.t == "ATXHeader"
+
+    def is_paragraph(self, ast_node):
+        """Is the node a paragraph?"""
+        return ast_node.t == "Paragraph"
+
+    def is_list(self, ast_node):
+        """Is the node a list? (ordered or unordered)"""
+        return ast_node.t == "List"
+
+    def is_link(self, ast_node):
+        """Is the node a link?"""
+        return ast_node.t == "Link"
+
+    def is_external(self, ast_node):
+        """Does the node reference content outside the file? (image or link)"""
+        return ast_node.t in ("Link", "Image")
+
+    def is_block(self, ast_node):
+        """Is the node a BlockQuoted element?"""
+        return ast_node.t == "BlockQuote"
-- 
GitLab