# Markdown Content Validation and Automated Quality Assurance: Complete Guide for Documentation Excellence and Team Workflows
Automated Markdown content validation and quality assurance systems enable professional documentation teams to maintain consistency, accuracy, and excellence across large content repositories while reducing manual review overhead and ensuring compliance with organizational standards. By implementing comprehensive validation workflows, automated testing pipelines, and quality metrics dashboards, technical teams can scale documentation processes without sacrificing content quality or user experience.
## Why Master Automated Content Validation?
Professional content validation provides essential benefits for documentation workflows:
- Consistency Enforcement: Automatically ensure adherence to style guides, formatting standards, and organizational conventions
- Error Prevention: Catch syntax errors, broken links, missing metadata, and content issues before publication
- Quality Metrics: Track documentation health through automated scoring, readability analysis, and compliance measurement
- Team Scalability: Enable distributed teams to contribute while maintaining unified quality standards
- Continuous Improvement: Identify patterns in content issues to refine processes and improve documentation systems
## Foundation Validation Systems
### Core Content Validation Framework
Building comprehensive validation systems for Markdown documentation:
# markdown_validator.py - Comprehensive content validation system
import re
import os
import yaml
import json
import requests
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from enum import Enum
from urllib.parse import urlparse, urljoin
import subprocess
from datetime import datetime, timedelta
class ValidationSeverity(Enum):
    """Severity attached to every validation finding.

    The member values are the lowercase strings used in serialised reports
    and accepted by the command line's ``--severity`` option.
    """

    # Must be fixed before publication.
    ERROR = "error"
    # Should be reviewed, but does not block publication.
    WARNING = "warning"
    # Purely informational hint.
    INFO = "info"
@dataclass
class ValidationResult:
    """A single finding produced by one validation rule.

    Only ``rule_id``, ``severity``, ``message`` and ``file_path`` are
    required; the remaining fields default to ``None`` when a rule cannot
    supply them.
    """

    # Identifier of the rule that produced the finding, e.g. 'heading_empty'.
    rule_id: str
    # How serious the finding is.
    severity: ValidationSeverity
    # Human-readable description of the problem.
    message: str
    # Path of the file the finding refers to.
    file_path: str
    # 1-based line number of the finding, when known.
    line_number: Optional[int] = None
    # Column of the finding, when known.
    column: Optional[int] = None
    # Snippet of the offending text, when available.
    context: Optional[str] = None
    # Suggested remedy, when the rule can offer one.
    fix_suggestion: Optional[str] = None
class ContentValidator:
    """Rule-driven validator for Markdown documents.

    Every ``validate_*`` method inspects one aspect of a file (frontmatter,
    headings, links, images, content structure, code blocks, accessibility)
    and appends ``ValidationResult`` objects to ``self.results``. Rules are
    enabled and tuned through ``self.config['rules']``.

    Markdown-syntax rules (headings, links, images, tables) ignore YAML
    frontmatter and fenced code blocks, because text in those regions is
    literal and not Markdown markup.

    Attributes:
        config: Effective configuration (built-in defaults merged with the
            optional YAML file).
        results: Findings of the most recent ``validate_file`` call.
        file_cache: Maps a file path to ``{'content': str, 'lines': list}``
            for every file read by ``validate_file``.
    """

    # Fence line: up to three spaces of indentation, a run of at least three
    # backticks or tildes, then an optional info string (language tag etc.).
    _FENCE_RE = re.compile(r'^ {0,3}(`{3,}|~{3,})[ \t]*(.*?)[ \t]*$')
    # ATX heading. The text group is optional so a bare "#" is recognised as
    # an empty heading instead of being silently ignored.
    _HEADING_RE = re.compile(r'^(#{1,6})(?:[ \t]+(.*))?$')
    # Inline link. The lookbehind keeps image syntax "![alt](src)" out, and
    # "*" (not "+") lets "[text]()" match so the empty URL can be reported.
    _LINK_RE = re.compile(r'(?<!!)\[([^\]]*)\]\(([^)]*)\)')
    _IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    _TABLE_SEPARATOR_RE = re.compile(r'^\|[\s\-\|:]+\|$')
    _SECRET_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'(?i)(password|pwd|pass)\s*[=:]\s*["\']?[a-zA-Z0-9!@#$%^&*()_+\-=\[\]{};\':"\\|,.<>/?]{8,}',
        r'(?i)(api[_-]?key|apikey)\s*[=:]\s*["\']?[a-zA-Z0-9]{20,}',
        r'(?i)(secret|token)\s*[=:]\s*["\']?[a-zA-Z0-9]{16,}',
        r'(?i)(private[_-]?key)\s*[=:]\s*["\']?-----BEGIN',
    ))
    _NON_DESCRIPTIVE_LINK_TEXT = frozenset(
        ['click here', 'read more', 'here', 'link', 'more', 'this'])

    def __init__(self, config_path: str = "validation_config.yaml"):
        """Create a validator configured from *config_path*.

        Built-in defaults are used when the file does not exist.
        """
        self.config = self.load_config(config_path)
        self.results = []
        self.file_cache = {}

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------
    def load_config(self, config_path: str) -> Dict:
        """Load validation configuration from a YAML file.

        Args:
            config_path: Path of the YAML configuration file.

        Returns:
            The built-in defaults, deep-merged with the file's content when
            the file exists and is not empty.

        Raises:
            ValueError: If the file's top-level YAML value is not a mapping.
        """
        default_config = {
            'rules': {
                'frontmatter': {
                    'enabled': True,
                    'required_fields': ['title', 'description', 'date', 'author'],
                    'optional_fields': ['keywords', 'category', 'tags'],
                    'date_format': '%Y-%m-%d',
                    'title_max_length': 100,
                    'description_max_length': 200
                },
                'headings': {
                    'enabled': True,
                    'require_h1': True,
                    'max_heading_length': 80,
                    'no_duplicate_headings': True,
                    'heading_case': 'title',  # title, sentence, lower, upper
                    'no_empty_headings': True
                },
                'links': {
                    'enabled': True,
                    'check_external': True,
                    'check_internal': True,
                    'allow_fragments': True,
                    'timeout': 10,
                    'retry_count': 2
                },
                'images': {
                    'enabled': True,
                    'require_alt_text': True,
                    'max_size_mb': 5,
                    'allowed_formats': ['jpg', 'jpeg', 'png', 'gif', 'webp', 'svg'],
                    'check_dimensions': True,
                    'min_width': 200,
                    'max_width': 2000
                },
                'content': {
                    'enabled': True,
                    'min_word_count': 100,
                    'max_line_length': 120,
                    'no_trailing_whitespace': True,
                    'consistent_list_markers': True,
                    'require_blank_lines': True
                },
                'code_blocks': {
                    'enabled': True,
                    'require_language_tags': True,
                    'validate_syntax': True,
                    'max_line_length': 100,
                    'no_hardcoded_secrets': True
                },
                'accessibility': {
                    'enabled': True,
                    'heading_hierarchy': True,
                    'link_text_descriptive': True,
                    'table_headers': True,
                    'color_contrast_warnings': True
                }
            },
            'file_patterns': {
                'include': ['**/*.md', '**/*.markdown'],
                'exclude': ['node_modules/**', '.git/**', '_site/**', 'vendor/**']
            },
            'output': {
                'format': 'json',  # json, junit, checkstyle, sarif
                'file': 'validation_results.json',
                'console': True,
                'details': True
            }
        }
        if not os.path.exists(config_path):
            return default_config
        with open(config_path, 'r', encoding='utf-8') as f:
            user_config = yaml.safe_load(f)
        # safe_load returns None for an empty (or comment-only) document.
        if user_config is None:
            return default_config
        if not isinstance(user_config, dict):
            raise ValueError(
                f'Configuration file {config_path} must contain a YAML mapping')
        return self.deep_merge(default_config, user_config)

    def deep_merge(self, base: Dict, override: Dict) -> Dict:
        """Deep merge configuration dictionaries.

        Keys of *override* win. When both sides hold a dict under the same
        key the two dicts are merged recursively. *base* is not modified; a
        ``None`` override is treated as empty.
        """
        result = base.copy()
        for key, value in (override or {}).items():
            if isinstance(result.get(key), dict) and isinstance(value, dict):
                result[key] = self.deep_merge(result[key], value)
            else:
                result[key] = value
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _add_result(self, rule_id, severity, message, file_path,
                    line_number=None, context=None, fix_suggestion=None):
        """Append one finding to ``self.results``."""
        self.results.append(ValidationResult(
            rule_id=rule_id,
            severity=severity,
            message=message,
            file_path=file_path,
            line_number=line_number,
            context=context,
            fix_suggestion=fix_suggestion
        ))

    @staticmethod
    def _truncate(text: str, limit: int = 50) -> str:
        """Return *text* cut to *limit* characters, with '...' when shortened."""
        return text[:limit] + '...' if len(text) > limit else text

    @staticmethod
    def _title_case(text: str) -> str:
        """Upper-case the first character of every whitespace-separated word.

        Unlike ``str.title()`` the rest of each word is left untouched, so
        acronyms ("API") and apostrophes ("Don't") are not mangled.
        """
        return re.sub(r'(?<!\S)\S', lambda m: m.group(0).upper(), text)

    @staticmethod
    def _link_destination(raw: str) -> str:
        """Extract the URL from the parenthesised part of a link or image.

        Drops an optional title (``url "Title"``) and the angle brackets of
        the ``<url>`` form. Returns '' when no destination is present.
        """
        raw = raw.strip()
        if raw.startswith('<') and '>' in raw:
            return raw[1:raw.index('>')]
        return raw.split(None, 1)[0] if raw else ''

    @staticmethod
    def _local_path(base_file: str, url: str) -> str:
        """Resolve a relative URL against the directory of *base_file*.

        Fragment and query string are removed and percent-escapes decoded so
        that ``my%20file.md?raw=1`` maps to the file ``my file.md``.
        """
        from urllib.parse import unquote
        path = unquote(url.split('#', 1)[0].split('?', 1)[0])
        return os.path.normpath(os.path.join(os.path.dirname(base_file), path))

    @staticmethod
    def _frontmatter_line_count(lines: List[str]) -> int:
        """Return how many leading lines form a closed frontmatter block (0 if none)."""
        if not lines or lines[0].strip() != '---':
            return 0
        for index, line in enumerate(lines[1:], 1):
            if line.strip() == '---':
                return index + 1
        return 0

    @classmethod
    def _parse_fence(cls, line: str, open_marker: Optional[str] = None):
        """Interpret *line* as a code fence relative to the current state.

        Args:
            line: The line to inspect.
            open_marker: Marker of the fenced block currently open, or
                ``None`` when outside any block.

        Returns:
            ``(marker, info_string)`` when the line opens a block (outside)
            or closes the open block (inside); otherwise ``None``.
        """
        match = cls._FENCE_RE.match(line)
        if match is None:
            return None
        marker, info = match.group(1), match.group(2)
        if open_marker is None:
            # A backtick in the info string means this is an inline code
            # span such as ```code```, not an opening fence.
            if marker[0] == '`' and '`' in info:
                return None
            return marker, info
        # A closing fence uses the same character, is at least as long as
        # the opening fence and carries no info string.
        if marker[0] == open_marker[0] and len(marker) >= len(open_marker) and not info:
            return marker, info
        return None

    def _iter_prose_lines(self, lines: List[str]):
        """Yield ``(line_number, line)`` for lines holding Markdown prose.

        Frontmatter lines and fenced code blocks (fence lines included) are
        skipped. Line numbers are 1-based positions in *lines*.
        """
        skip_until = self._frontmatter_line_count(lines)
        open_marker = None
        for line_num, line in enumerate(lines, 1):
            if line_num <= skip_until:
                continue
            fence = self._parse_fence(line, open_marker)
            if open_marker is None:
                if fence is None:
                    yield line_num, line
                else:
                    open_marker = fence[0]
            elif fence is not None:
                open_marker = None

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def validate_file(self, file_path: str) -> List["ValidationResult"]:
        """Validate a single Markdown file.

        Resets ``self.results``, runs every rule group and returns the
        findings. A missing or unreadable file yields a single ERROR result
        instead of raising.
        """
        self.results = []
        if not os.path.exists(file_path):
            self._add_result('file_exists', ValidationSeverity.ERROR,
                             f'File does not exist: {file_path}', file_path)
            return self.results
        try:
            # utf-8-sig drops a leading byte-order mark, which would
            # otherwise hide the frontmatter's opening "---".
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            self._add_result('file_unreadable', ValidationSeverity.ERROR,
                             f'Could not read file: {exc}', file_path)
            return self.results
        lines = content.split('\n')
        self.file_cache[file_path] = {'content': content, 'lines': lines}
        # Run validation rules
        self.validate_frontmatter(file_path, content, lines)
        self.validate_headings(file_path, content, lines)
        self.validate_links(file_path, content, lines)
        self.validate_images(file_path, content, lines)
        self.validate_content_structure(file_path, content, lines)
        self.validate_code_blocks(file_path, content, lines)
        self.validate_accessibility(file_path, content, lines)
        return self.results

    # ------------------------------------------------------------------
    # Frontmatter
    # ------------------------------------------------------------------
    def validate_frontmatter(self, file_path: str, content: str, lines: List[str]):
        """Validate YAML frontmatter.

        Checks that the block exists, is closed, parses to a mapping,
        contains the required fields, and respects the title/description
        length limits and the configured date format.
        """
        rules = self.config['rules']['frontmatter']
        if not rules['enabled']:
            return
        if not content.startswith('---'):
            self._add_result('frontmatter_missing', ValidationSeverity.ERROR,
                             'Missing YAML frontmatter', file_path, line_number=1,
                             fix_suggestion='Add YAML frontmatter starting with ---')
            return
        # Index (in ``lines``) of the closing '---', if any.
        end_marker = next(
            (i for i, line in enumerate(lines[1:], 1) if line.strip() == '---'),
            None)
        if end_marker is None:
            self._add_result('frontmatter_malformed', ValidationSeverity.ERROR,
                             'Frontmatter not properly closed', file_path, line_number=1,
                             fix_suggestion='Add closing --- after frontmatter')
            return
        frontmatter_lines = lines[1:end_marker]
        frontmatter_text = '\n'.join(frontmatter_lines)
        try:
            frontmatter = yaml.safe_load(frontmatter_text)
        except yaml.YAMLError as e:
            self._add_result('frontmatter_yaml_error', ValidationSeverity.ERROR,
                             f'Invalid YAML in frontmatter: {e}', file_path,
                             line_number=1, context=frontmatter_text[:100])
            return
        if not isinstance(frontmatter, dict):
            self._add_result('frontmatter_not_dict', ValidationSeverity.ERROR,
                             'Frontmatter must be a YAML dictionary', file_path,
                             line_number=1)
            return
        # Required fields: absent and empty values are both reported.
        for field in rules['required_fields']:
            if field not in frontmatter or not frontmatter[field]:
                self._add_result(
                    f'frontmatter_missing_{field}', ValidationSeverity.ERROR,
                    f'Missing required frontmatter field: {field}', file_path,
                    line_number=self.find_field_line(frontmatter_lines, field),
                    fix_suggestion=f'Add {field}: "value" to frontmatter')
        # Length limits.
        if 'title' in frontmatter:
            title_length = len(str(frontmatter['title']))
            max_length = rules['title_max_length']
            if title_length > max_length:
                self._add_result(
                    'frontmatter_title_too_long', ValidationSeverity.WARNING,
                    f'Title too long ({title_length} chars, max {max_length})',
                    file_path,
                    line_number=self.find_field_line(frontmatter_lines, 'title'))
        if 'description' in frontmatter:
            desc_length = len(str(frontmatter['description']))
            max_length = rules['description_max_length']
            if desc_length > max_length:
                self._add_result(
                    'frontmatter_description_too_long', ValidationSeverity.WARNING,
                    f'Description too long ({desc_length} chars, max {max_length})',
                    file_path,
                    line_number=self.find_field_line(frontmatter_lines, 'description'))
        # Date format.
        if 'date' in frontmatter:
            date_format = rules['date_format']
            try:
                datetime.strptime(str(frontmatter['date']), date_format)
            except ValueError:
                self._add_result(
                    'frontmatter_invalid_date', ValidationSeverity.ERROR,
                    f'Invalid date format, expected {date_format}', file_path,
                    line_number=self.find_field_line(frontmatter_lines, 'date'),
                    fix_suggestion=f'Use date format: {date_format}')

    def find_field_line(self, frontmatter_lines: List[str], field: str) -> Optional[int]:
        """Find the file line number of a frontmatter field.

        Numbering starts at 2 because line 1 of the file is the opening
        '---'. Returns ``None`` when the field is not present.
        """
        prefix = f'{field}:'
        for line_num, line in enumerate(frontmatter_lines, 2):
            if line.strip().startswith(prefix):
                return line_num
        return None

    # ------------------------------------------------------------------
    # Headings
    # ------------------------------------------------------------------
    def validate_headings(self, file_path: str, content: str, lines: List[str]):
        """Validate heading structure and formatting.

        Reports empty, over-long and duplicate headings, headings that are
        not in title case, and a missing H1. Lines inside frontmatter and
        fenced code blocks are ignored, so a '# comment' in a shell snippet
        is not mistaken for a heading.
        """
        rules = self.config['rules']['headings']
        if not rules['enabled']:
            return
        found_h1 = False
        seen_headings = set()
        for line_num, line in self._iter_prose_lines(lines):
            match = self._HEADING_RE.match(line)
            if not match:
                continue
            level = len(match.group(1))
            heading_text = (match.group(2) or '').strip()
            if level == 1:
                found_h1 = True
            if rules['no_empty_headings'] and not heading_text:
                self._add_result('heading_empty', ValidationSeverity.ERROR,
                                 'Empty heading found', file_path,
                                 line_number=line_num, context=line,
                                 fix_suggestion='Add content to the heading or remove it')
                continue
            max_length = rules['max_heading_length']
            if len(heading_text) > max_length:
                self._add_result(
                    'heading_too_long', ValidationSeverity.WARNING,
                    f'Heading too long ({len(heading_text)} chars, max {max_length})',
                    file_path, line_number=line_num,
                    context=self._truncate(heading_text))
            if rules['no_duplicate_headings']:
                heading_lower = heading_text.lower()
                if heading_lower in seen_headings:
                    self._add_result('heading_duplicate', ValidationSeverity.WARNING,
                                     f'Duplicate heading: {heading_text}', file_path,
                                     line_number=line_num, context=heading_text)
                seen_headings.add(heading_lower)
            if rules['heading_case'] == 'title':
                expected = self._title_case(heading_text)
                if heading_text != expected:
                    self._add_result('heading_case_title', ValidationSeverity.INFO,
                                     'Heading should use title case', file_path,
                                     line_number=line_num, context=heading_text,
                                     fix_suggestion=f'Change to: {expected}')
        if rules['require_h1'] and not found_h1:
            self._add_result(
                'heading_missing_h1', ValidationSeverity.ERROR,
                'Document missing H1 heading', file_path, line_number=1,
                fix_suggestion='Add a main heading with # at the top of the document')

    # ------------------------------------------------------------------
    # Links
    # ------------------------------------------------------------------
    def validate_links(self, file_path: str, content: str, lines: List[str]):
        """Validate internal and external links.

        Image references are not treated as links here; they are handled by
        ``validate_images``.
        """
        rules = self.config['rules']['links']
        if not rules['enabled']:
            return
        for line_num, line in self._iter_prose_lines(lines):
            for match in self._LINK_RE.finditer(line):
                link_text = match.group(1)
                link_url = self._link_destination(match.group(2))
                if not link_url:
                    self._add_result('link_empty_url', ValidationSeverity.ERROR,
                                     'Link with empty URL', file_path,
                                     line_number=line_num, context=match.group(0))
                    continue
                if not link_text:
                    self._add_result('link_empty_text', ValidationSeverity.WARNING,
                                     'Link with empty text', file_path,
                                     line_number=line_num, context=match.group(0),
                                     fix_suggestion='Add descriptive link text')
                if rules['check_internal'] and self.is_internal_link(link_url):
                    if not self.validate_internal_link(file_path, link_url):
                        self._add_result('link_broken_internal', ValidationSeverity.ERROR,
                                         f'Broken internal link: {link_url}', file_path,
                                         line_number=line_num, context=match.group(0))
                if rules['check_external'] and self.is_external_link(link_url):
                    if not self.validate_external_link(link_url):
                        self._add_result('link_broken_external', ValidationSeverity.WARNING,
                                         f'Broken external link: {link_url}', file_path,
                                         line_number=line_num, context=match.group(0))

    def is_internal_link(self, url: str) -> bool:
        """Check if URL is an internal (relative, scheme-less) link.

        Anything with a scheme (``https:``, ``mailto:``, ``data:`` ...) or a
        network location (``//cdn.example.com/x``) is not internal.
        """
        try:
            parsed = urlparse(url)
        except ValueError:
            # Unparseable URL: certainly not a local path.
            return False
        return not parsed.scheme and not parsed.netloc

    def is_external_link(self, url: str) -> bool:
        """Check if URL is an http(s) link (scheme compared case-insensitively)."""
        return url.lower().startswith(('http://', 'https://'))

    def validate_internal_link(self, base_file: str, link_url: str) -> bool:
        """Validate that the target of an internal link exists.

        A pure fragment (``#section``) is checked against the headings of
        *base_file*. Otherwise the path is resolved relative to *base_file*;
        a missing '.md' extension is tolerated.
        """
        file_part, has_fragment, fragment = link_url.partition('#')
        if has_fragment and not file_part:  # Same-page fragment
            return self.validate_fragment(base_file, fragment)
        target_path = self._local_path(base_file, file_part)
        if os.path.exists(target_path):
            return True
        return not target_path.endswith('.md') and os.path.exists(target_path + '.md')

    def validate_external_link(self, url: str) -> bool:
        """Validate that an external link is accessible.

        Issues a HEAD request (GET when the server rejects HEAD) and retries
        up to ``retry_count`` additional times on network errors. Returns
        ``True`` for a status code below 400, ``False`` otherwise.
        """
        rules = self.config['rules']['links']
        timeout = rules['timeout']
        attempts = max(rules['retry_count'], 0) + 1
        for _ in range(attempts):
            try:
                response = requests.head(url, timeout=timeout, allow_redirects=True)
                if response.status_code in (405, 501):
                    # Server does not implement HEAD; fall back to GET but
                    # stream so the body is not downloaded.
                    response = requests.get(url, timeout=timeout,
                                            allow_redirects=True, stream=True)
                    response.close()
                return response.status_code < 400
            except requests.RequestException:
                continue
            except (ValueError, UnicodeError):
                # Malformed URL: retrying cannot help.
                return False
        return False

    def validate_fragment(self, file_path: str, fragment: str) -> bool:
        """Check if a fragment/anchor matches a heading in the file.

        Heading text is lower-cased, every character outside
        ``[a-zA-Z0-9-_]`` becomes '-', runs of '-' are collapsed and leading
        or trailing '-' removed; the result must equal *fragment*. The file
        is read from disk when it is not in ``self.file_cache``.
        """
        cached = self.file_cache.get(file_path)
        if cached is None:
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                lines = f.read().split('\n')
        else:
            lines = cached['lines']
        for _, line in self._iter_prose_lines(lines):
            match = self._HEADING_RE.match(line)
            if not match or not match.group(2):
                continue
            slug = re.sub(r'[^a-zA-Z0-9\-_]', '-', match.group(2).lower())
            slug = re.sub(r'-+', '-', slug).strip('-')
            if slug == fragment:
                return True
        return False

    # ------------------------------------------------------------------
    # Images
    # ------------------------------------------------------------------
    def validate_images(self, file_path: str, content: str, lines: List[str]):
        """Validate image references and properties."""
        rules = self.config['rules']['images']
        if not rules['enabled']:
            return
        for line_num, line in self._iter_prose_lines(lines):
            for match in self._IMAGE_RE.finditer(line):
                alt_text = match.group(1)
                image_url = self._link_destination(match.group(2))
                if rules['require_alt_text'] and not alt_text:
                    self._add_result(
                        'image_missing_alt', ValidationSeverity.WARNING,
                        'Image missing alt text', file_path,
                        line_number=line_num, context=match.group(0),
                        fix_suggestion='Add descriptive alt text for accessibility')
                # Only local files can be inspected.
                if image_url and self.is_internal_link(image_url):
                    self.validate_image_file(file_path, image_url, line_num, match.group(0))

    def validate_image_file(self, base_file: str, image_url: str, line_num: int, context: str):
        """Validate a local image file: existence, extension and size."""
        rules = self.config['rules']['images']
        image_path = self._local_path(base_file, image_url)
        if not os.path.exists(image_path):
            self._add_result('image_file_missing', ValidationSeverity.ERROR,
                             f'Image file not found: {image_url}', base_file,
                             line_number=line_num, context=context)
            return
        allowed_formats = rules['allowed_formats']
        file_ext = os.path.splitext(image_path)[1].lower().lstrip('.')
        if file_ext not in allowed_formats:
            self._add_result(
                'image_invalid_format', ValidationSeverity.WARNING,
                f'Image format not allowed: {file_ext} (allowed: {", ".join(allowed_formats)})',
                base_file, line_number=line_num, context=context)
        file_size_mb = os.path.getsize(image_path) / (1024 * 1024)
        max_size = rules['max_size_mb']
        if file_size_mb > max_size:
            self._add_result(
                'image_too_large', ValidationSeverity.WARNING,
                f'Image too large: {file_size_mb:.1f}MB (max {max_size}MB)',
                base_file, line_number=line_num, context=context,
                fix_suggestion='Optimize image size or use web-optimized format')

    # ------------------------------------------------------------------
    # Content structure
    # ------------------------------------------------------------------
    def validate_content_structure(self, file_path: str, content: str, lines: List[str]):
        """Validate overall content structure and formatting.

        Checks the minimum word count (over the whole file), the maximum
        line length and, when enabled, trailing whitespace.
        """
        rules = self.config['rules']['content']
        if not rules['enabled']:
            return
        word_count = len(content.split())
        min_words = rules['min_word_count']
        if word_count < min_words:
            self._add_result(
                'content_too_short', ValidationSeverity.INFO,
                f'Content may be too short: {word_count} words (min {min_words})',
                file_path, line_number=1)
        max_line_length = rules['max_line_length']
        for line_num, line in enumerate(lines, 1):
            if len(line) > max_line_length:
                self._add_result(
                    'line_too_long', ValidationSeverity.INFO,
                    f'Line too long: {len(line)} chars (max {max_line_length})',
                    file_path, line_number=line_num, context=self._truncate(line))
        if rules['no_trailing_whitespace']:
            for line_num, line in enumerate(lines, 1):
                if line.endswith((' ', '\t')):
                    self._add_result('trailing_whitespace', ValidationSeverity.INFO,
                                     'Line has trailing whitespace', file_path,
                                     line_number=line_num,
                                     fix_suggestion='Remove trailing whitespace')

    # ------------------------------------------------------------------
    # Code blocks
    # ------------------------------------------------------------------
    def validate_code_blocks(self, file_path: str, content: str, lines: List[str]):
        """Validate fenced code blocks.

        Reports blocks without a language tag, scans block content for
        hardcoded secrets and reports a block left open at end of file.
        Backtick and tilde fences, indented fences and info strings such as
        ``c++`` or ``js title="x"`` are recognised.
        """
        rules = self.config['rules']['code_blocks']
        if not rules['enabled']:
            return
        open_marker = None
        code_start_line = None
        for line_num, line in enumerate(lines, 1):
            fence = self._parse_fence(line, open_marker)
            if open_marker is None:
                if fence is None:
                    continue
                # Start of code block
                open_marker, info = fence
                code_start_line = line_num
                # The language is the first word of the info string.
                code_language = info.split()[0] if info else ''
                if rules['require_language_tags'] and not code_language:
                    self._add_result('code_block_missing_language',
                                     ValidationSeverity.WARNING,
                                     'Code block missing language tag', file_path,
                                     line_number=line_num, context=line,
                                     fix_suggestion='Add language identifier after ```')
            elif fence is not None:
                # End of code block
                open_marker = None
                code_start_line = None
            elif rules['no_hardcoded_secrets']:
                self.check_for_secrets(file_path, line, line_num)
        if open_marker is not None:
            self._add_result('code_block_unclosed', ValidationSeverity.ERROR,
                             'Unclosed code block', file_path,
                             line_number=code_start_line,
                             fix_suggestion='Add closing ``` to close the code block')

    def check_for_secrets(self, file_path: str, line: str, line_num: int):
        """Check one line of code for hardcoded secrets.

        At most one finding is recorded per line. The context names the
        matched key only; the value is redacted so the report does not
        itself leak the secret.
        """
        for pattern in self._SECRET_PATTERNS:
            match = pattern.search(line)
            if match is None:
                continue
            self._add_result(
                'code_hardcoded_secret', ValidationSeverity.WARNING,
                'Possible hardcoded secret detected', file_path,
                line_number=line_num,
                context=f'{match.group(1)}: [REDACTED]',
                fix_suggestion='Use environment variables or placeholders for secrets')
            break

    # ------------------------------------------------------------------
    # Accessibility
    # ------------------------------------------------------------------
    def validate_accessibility(self, file_path: str, content: str, lines: List[str]):
        """Validate accessibility compliance by running the enabled sub-checks."""
        rules = self.config['rules']['accessibility']
        if not rules['enabled']:
            return
        if rules['heading_hierarchy']:
            self.validate_heading_hierarchy(file_path, lines)
        if rules['link_text_descriptive']:
            self.validate_descriptive_links(file_path, lines)
        if rules['table_headers']:
            self.validate_table_headers(file_path, lines)

    def validate_heading_hierarchy(self, file_path: str, lines: List[str]):
        """Check proper heading hierarchy (no skipped levels when descending)."""
        previous_level = 0
        for line_num, line in self._iter_prose_lines(lines):
            match = self._HEADING_RE.match(line)
            if not match:
                continue
            current_level = len(match.group(1))
            if previous_level > 0 and current_level > previous_level + 1:
                self._add_result(
                    'accessibility_heading_hierarchy', ValidationSeverity.WARNING,
                    f'Heading level skipped (h{previous_level} -> h{current_level})',
                    file_path, line_number=line_num, context=line,
                    fix_suggestion=f'Use h{previous_level + 1} instead of h{current_level}')
            previous_level = current_level

    def validate_descriptive_links(self, file_path: str, lines: List[str]):
        """Check for non-descriptive link text such as "click here"."""
        for line_num, line in self._iter_prose_lines(lines):
            for match in self._LINK_RE.finditer(line):
                link_text = match.group(1).lower().strip()
                if link_text in self._NON_DESCRIPTIVE_LINK_TEXT:
                    self._add_result(
                        'accessibility_link_text', ValidationSeverity.INFO,
                        f'Non-descriptive link text: "{link_text}"', file_path,
                        line_number=line_num, context=match.group(0),
                        fix_suggestion='Use descriptive text that explains the link destination')

    def validate_table_headers(self, file_path: str, lines: List[str]):
        """Check that the first row of each table is followed by a separator row."""
        in_table = False
        table_start = None
        previous_line_num = 0
        for line_num, line in self._iter_prose_lines(lines):
            if line_num != previous_line_num + 1:
                # Skipped lines (code block or frontmatter) end any table.
                in_table = False
                table_start = None
            previous_line_num = line_num
            stripped = line.strip()
            if stripped.startswith('|') and stripped.endswith('|'):
                if not in_table:
                    in_table = True
                    table_start = line_num
                    # line_num is 1-based, so lines[line_num] is the next line.
                    if line_num < len(lines):
                        next_line = lines[line_num]
                        if not self._TABLE_SEPARATOR_RE.match(next_line.strip()):
                            self._add_result(
                                'accessibility_table_headers', ValidationSeverity.INFO,
                                'Table may be missing header row', file_path,
                                line_number=table_start, context=line,
                                fix_suggestion='Add header row with | --- | separators')
            elif in_table and '|' not in line:
                in_table = False
                table_start = None

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def generate_report(self) -> Dict:
        """Generate a JSON-serialisable report from ``self.results``.

        Returns:
            A dict with ``summary`` (totals per severity, number of files
            with findings, five most frequent rules), ``results_by_file``
            (file path -> list of result dicts), ``all_results`` and the
            ``generated_at`` timestamp. Every result is stored as a plain
            dict so the report can be passed straight to ``json.dumps``.
        """
        severity_counts = {'error': 0, 'warning': 0, 'info': 0}
        rule_counts = {}
        file_results = {}
        all_results = []
        for result in self.results:
            entry = self.result_to_dict(result)
            all_results.append(entry)
            severity_counts[entry['severity']] += 1
            rule_counts[result.rule_id] = rule_counts.get(result.rule_id, 0) + 1
            file_results.setdefault(result.file_path, []).append(entry)
        return {
            'summary': {
                'total_issues': len(all_results),
                'errors': severity_counts['error'],
                'warnings': severity_counts['warning'],
                'info': severity_counts['info'],
                'files_with_issues': len(file_results),
                'most_common_issues': sorted(
                    rule_counts.items(), key=lambda item: item[1], reverse=True)[:5]
            },
            'results_by_file': file_results,
            'all_results': all_results,
            'generated_at': datetime.now().isoformat()
        }

    def result_to_dict(self, result: "ValidationResult") -> Dict:
        """Convert a ValidationResult to a plain dict (severity as its string value)."""
        return {
            'rule_id': result.rule_id,
            'severity': result.severity.value,
            'message': result.message,
            'file_path': result.file_path,
            'line_number': result.line_number,
            'column': result.column,
            'context': result.context,
            'fix_suggestion': result.fix_suggestion
        }
# CLI interface
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Validate Markdown content')
    parser.add_argument('files', nargs='+', help='Files to validate')
    parser.add_argument('--config', '-c', help='Configuration file path')
    parser.add_argument('--output', '-o', help='Output file path')
    parser.add_argument('--format', choices=['json', 'text'], default='text', help='Output format')
    parser.add_argument('--severity', choices=['error', 'warning', 'info'], default='info', help='Minimum severity to report')
    args = parser.parse_args()

    validator = ContentValidator(args.config or 'validation_config.yaml')

    # validate_file() resets the validator's result list on every call, so
    # the findings of all files are accumulated here.
    all_results = []
    for file_path in args.files:
        all_results.extend(validator.validate_file(file_path))

    # Keep findings at least as severe as requested (lower rank = more severe).
    severity_order = {'error': 0, 'warning': 1, 'info': 2}
    min_severity = severity_order[args.severity]
    filtered_results = [r for r in all_results if severity_order[r.severity.value] <= min_severity]

    if args.format == 'json':
        validator.results = filtered_results
        report = validator.generate_report()
        # The report may still hold ValidationResult objects; ``default``
        # turns them into plain dicts so serialisation cannot fail on them.
        output = json.dumps(report, indent=2, default=validator.result_to_dict)
    else:
        output_lines = []
        for result in filtered_results:
            line = f"{result.file_path}:{result.line_number or 0}: {result.severity.value}: {result.message}"
            if result.context:
                line += f" (context: {result.context})"
            if result.fix_suggestion:
                line += f" (fix: {result.fix_suggestion})"
            output_lines.append(line)
        output = '\n'.join(output_lines)

    if args.output:
        # Explicit UTF-8: contexts quote document text, which may contain
        # characters the platform's default encoding cannot represent.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
    else:
        print(output)
### Advanced Validation Rules Engine
Creating extensible validation systems with custom rule definitions:
# validation_config.yaml - Comprehensive validation configuration
rules:
frontmatter:
enabled: true
required_fields: ['title', 'description', 'date', 'author']
optional_fields: ['keywords', 'category', 'tags', 'image', 'layout']
field_validation:
title:
max_length: 100
min_length: 10
pattern: '^[A-Z].*[^.]$' # Start with capital, no trailing period
no_generic_words: ['Guide', 'Tutorial', 'Documentation']
description:
max_length: 200
min_length: 50
unique_across_site: true
date:
format: '%Y-%m-%d'
not_future: true
not_older_than_days: 3650 # 10 years
keywords:
min_count: 3
max_count: 12
lowercase_only: true
no_duplicates: true
category:
allowed_values: ['Tutorial', 'Guide', 'Reference', 'Blog', 'News']
author:
required_format: 'First Last'
validate_against_team_list: true
content_quality:
enabled: true
readability:
min_flesch_reading_ease: 30
max_flesch_reading_ease: 90
warn_complex_sentences: true
max_sentence_length: 25
structure:
max_paragraph_length: 150
require_introduction_paragraph: true
require_conclusion_paragraph: true
max_heading_depth: 6
balanced_content_distribution: true
terminology:
consistent_spelling: true
technical_terms_glossary: 'glossary.yaml'
brand_terms_consistency: true
forbidden_terms: ['obviously', 'simply', 'just', 'easy']
links:
enabled: true
internal_links:
check_existence: true
allow_fragments: true
validate_anchors: true
prefer_relative_paths: true
warn_external_looking_internal: true
external_links:
check_accessibility: true
timeout: 15
retry_count: 3
check_redirects: true
warn_shortened_urls: true
allowed_domains: []
blocked_domains: ['bit.ly', 'tinyurl.com', 't.co']
check_ssl_certificates: true
images:
enabled: true
accessibility:
require_alt_text: true
alt_text_min_length: 5
alt_text_max_length: 125
no_filename_alt_text: true
descriptive_alt_text_check: true
technical:
max_file_size_mb: 5
allowed_formats: ['jpg', 'jpeg', 'png', 'gif', 'webp', 'svg']
min_dimensions: [200, 200]
max_dimensions: [2000, 2000]
check_image_optimization: true
require_responsive_images: true
organization:
preferred_directory: 'assets/images'
naming_convention: 'kebab-case'
require_organized_subdirectories: true
code_blocks:
enabled: true
syntax:
require_language_tags: true
validate_syntax_highlighting: true
supported_languages: ['javascript', 'python', 'bash', 'yaml', 'json', 'html', 'css', 'sql']
warn_unsupported_languages: true
security:
no_hardcoded_secrets: true
no_personal_information: true
warn_production_urls: true
scan_for_vulnerabilities: true
style:
max_line_length: 120
consistent_indentation: true
no_trailing_whitespace: true
require_comments_for_complex_code: true
accessibility:
enabled: true
headings:
proper_hierarchy: true
descriptive_text: true
no_generic_headings: true
max_heading_length: 60
links:
descriptive_text: true
no_generic_link_text: true
context_independent: true
keyboard_accessible: true
tables:
require_headers: true
require_captions: true
proper_markup: true
responsive_design: true
media:
alt_text_required: true
captions_for_videos: true
transcripts_for_audio: true
seo_optimization:
enabled: true
meta_tags:
title_length: [30, 60]
description_length: [120, 160]
keywords_relevance_check: true
unique_titles_across_site: true
content:
keyword_density_check: true
heading_keyword_optimization: true
internal_linking_score: true
content_freshness_warnings: true
technical:
structured_data_validation: true
canonical_urls: true
og_tags_validation: true
twitter_cards_validation: true
performance:
enabled: true
file_sizes:
max_markdown_size_kb: 500
warn_large_files: true
images:
optimization_check: true
format_recommendations: true
lazy_loading_suggestions: true
external_resources:
minimize_external_requests: true
cdn_usage_recommendations: true
custom_rules:
team_specific:
- name: 'brand_consistency'
description: 'Check for consistent brand terminology'
pattern: '(?i)(our product|the platform|this tool)'
suggestion: 'Use specific product names instead of generic terms'
severity: 'info'
- name: 'documentation_templates'
description: 'Ensure API documentation follows template structure'
applies_to: 'content/api/**/*.md'
required_sections: ['Overview', 'Parameters', 'Response', 'Examples']
severity: 'warning'
- name: 'changelog_format'
description: 'Validate changelog entry format'
applies_to: 'CHANGELOG.md'
entry_pattern: '^## \[\d+\.\d+\.\d+\] - \d{4}-\d{2}-\d{2}$'
severity: 'error'
file_patterns:
include: ['**/*.md', '**/*.markdown']
exclude:
- 'node_modules/**'
- '.git/**'
- '_site/**'
- 'vendor/**'
- '**/README.md' # Often have different standards
- 'docs/archive/**' # Archived content
output:
formats: ['json', 'junit', 'sarif', 'checkstyle']
console_output: true
detailed_reports: true
metrics_tracking: true
historical_comparison: true
quality_gates:
max_errors: 0
max_warnings: 10
min_quality_score: 80
required_checks: ['frontmatter', 'links', 'accessibility']
integrations:
github:
create_issues_for_errors: true
comment_on_pull_requests: true
status_checks: true
slack:
webhook_url: '${SLACK_WEBHOOK_URL}'
notify_on_quality_regression: true
email:
recipients: ['docs-team@example.com']
daily_summary: true
## CI/CD Integration and Automation
### GitHub Actions Workflow
Implementing automated validation in continuous integration pipelines:
# .github/workflows/content-validation.yml - Comprehensive content validation workflow
name: Content Validation and Quality Assurance
on:
push:
branches: [main, develop]
paths:
- '**/*.md'
- '**/*.markdown'
- 'content/**'
- 'docs/**'
pull_request:
branches: [main, develop]
paths:
- '**/*.md'
- '**/*.markdown'
- 'content/**'
- 'docs/**'
schedule:
# Run comprehensive validation weekly
- cron: '0 2 * * 1'
env:
NODE_VERSION: '18'
PYTHON_VERSION: '3.9'
jobs:
# Fast validation for quick feedback
quick-validation:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 2 # Need depth for changed files detection
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
cache: 'pip'
- name: Install validation tools
run: |
pip install -r requirements-validation.txt
npm install -g markdownlint-cli
npm install -g markdown-link-check
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v39
with:
files: |
**/*.md
**/*.markdown
files_ignore: |
node_modules/**
.git/**
- name: Run quick validation on changed files
  if: steps.changed-files.outputs.any_changed == 'true'
  env:
    # Hand the file list to the script through the environment instead of
    # expanding the expression inside the script text: file names in a pull
    # request are attacker-controlled and would otherwise be interpreted as
    # shell code.
    CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
  run: |
    echo "Changed files: $CHANGED_FILES"
    # $CHANGED_FILES is intentionally unquoted: the list is space-separated
    # and every entry must become its own argument.
    python scripts/markdown_validator.py \
      --config validation_config.yaml \
      --format json \
      --output validation_results.json \
      --severity warning \
      $CHANGED_FILES
- name: Upload validation results
uses: actions/upload-artifact@v3
with:
name: quick-validation-results
path: validation_results.json
- name: Comment PR with results
if: github.event_name == 'pull_request' && steps.changed-files.outputs.any_changed == 'true'
uses: actions/github-script@v6
with:
script: |
const fs = require('fs');
try {
const results = JSON.parse(fs.readFileSync('validation_results.json', 'utf8'));
let comment = '## 📝 Content Validation Results\n\n';
comment += `**Summary:**\n`;
comment += `- Errors: ${results.summary.errors}\n`;
comment += `- Warnings: ${results.summary.warnings}\n`;
comment += `- Info: ${results.summary.info}\n`;
comment += `- Files checked: ${results.summary.files_with_issues}\n\n`;
if (results.summary.errors > 0) {
comment += '❌ **Validation failed** - please fix errors before merging.\n\n';
} else if (results.summary.warnings > 0) {
comment += '⚠️ **Validation passed with warnings** - consider addressing warnings.\n\n';
} else {
comment += '✅ **All validations passed!**\n\n';
}
if (results.summary.most_common_issues.length > 0) {
comment += '**Most common issues:**\n';
results.summary.most_common_issues.forEach(([rule, count]) => {
comment += `- ${rule}: ${count} occurrences\n`;
});
}
// Add details for errors and warnings
const importantResults = results.all_results.filter(r =>
r.severity === 'error' || r.severity === 'warning'
);
if (importantResults.length > 0 && importantResults.length <= 10) {
comment += '\n**Issues found:**\n';
importantResults.forEach(result => {
const icon = result.severity === 'error' ? '❌' : '⚠️';
const location = result.line_number ? `:${result.line_number}` : '';
comment += `${icon} **${result.file_path}${location}**: ${result.message}\n`;
if (result.fix_suggestion) {
comment += ` 💡 ${result.fix_suggestion}\n`;
}
});
} else if (importantResults.length > 10) {
comment += `\n**${importantResults.length} issues found** - see full report in workflow artifacts.\n`;
}
github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: comment
});
} catch (error) {
console.error('Failed to post validation results:', error);
}
# Comprehensive validation for main branches and scheduled runs
  comprehensive-validation:
    # Full-repository run: project validator, external linters, dashboard
    # generation, and (on main) committing refreshed quality metrics.
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop' || github.event_name == 'schedule'
    permissions:
      contents: write # the metrics commit below pushes to the branch
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
          # Default lookup is requirements.txt/pyproject.toml; this repo
          # installs from requirements-validation.txt.
          cache-dependency-path: requirements-validation.txt
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          # NOTE(review): npm caching expects a lock file in the repository;
          # confirm one exists or drop this option.
          cache: 'npm'
      - name: Install comprehensive validation tools
        run: |
          # Python tools
          pip install -r requirements-validation.txt
          pip install textstat
          pip install py-readability-metrics
          # Node.js tools
          npm install -g markdownlint-cli
          npm install -g markdown-link-check
          npm install -g alex
          npm install -g write-good
          # Additional tools
          # Unpack outside the workspace so files bundled in the archive
          # cannot overwrite, or be linted as, repository content.
          mkdir -p "${RUNNER_TEMP}/vale"
          wget -O "${RUNNER_TEMP}/vale.tar.gz" https://github.com/errata-ai/vale/releases/download/v2.25.0/vale_2.25.0_Linux_64-bit.tar.gz
          tar -xzf "${RUNNER_TEMP}/vale.tar.gz" -C "${RUNNER_TEMP}/vale"
          sudo mv "${RUNNER_TEMP}/vale/vale" /usr/local/bin/
      - name: Run comprehensive validation
        run: |
          python scripts/comprehensive_validator.py \
            --config validation_config.yaml \
            --output-dir validation-reports/ \
            --include-metrics \
            --generate-trends \
            --parallel-processing
      - name: Run external tool validations
        continue-on-error: true
        run: |
          # Without globstar, bash treats ** like * and misses nested files.
          shopt -s globstar
          mkdir -p validation-reports
          # Markdown linting
          markdownlint --config .markdownlint.json "**/*.md" > validation-reports/markdownlint.txt || true
          # Link checking (external links only for comprehensive check)
          find . -name "*.md" -exec markdown-link-check {} \; > validation-reports/link-check.txt || true
          # Prose linting
          alex . --why > validation-reports/alex-report.txt || true
          write-good **/*.md > validation-reports/write-good.txt || true
          # Style and grammar checking
          vale --config vale.ini . > validation-reports/vale-report.txt || true
      - name: Generate quality metrics dashboard
        run: |
          python scripts/generate_quality_dashboard.py \
            --input validation-reports/ \
            --output validation-reports/dashboard.html \
            --include-trends \
            --compare-with-baseline
      - name: Upload comprehensive reports
        # upload-artifact v3 has been retired on github.com; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: comprehensive-validation-reports
          path: validation-reports/
          retention-days: 30
      - name: Update quality metrics
        if: github.ref == 'refs/heads/main'
        run: |
          python scripts/update_quality_metrics.py \
            --results validation-reports/comprehensive-results.json \
            --metrics-file docs/quality-metrics.json \
            --trend-data docs/quality-trends.json
      - name: Commit updated metrics
        if: github.ref == 'refs/heads/main'
        run: |
          # The address in the published snippet was mangled by e-mail
          # obfuscation; this is the conventional Actions commit identity.
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          # Stage first: a plain `git diff` ignores untracked files, so newly
          # created metrics files would never be committed.
          git add docs/quality-metrics.json docs/quality-trends.json
          if git diff --cached --quiet; then
            echo "No metrics changes to commit"
          else
            git commit -m "Update documentation quality metrics [skip ci]"
            git push
          fi
# Security and compliance scanning
  security-validation:
    # Scans the repository for committed secrets and runs the project's
    # compliance checker; reports are published as an artifact.
    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request' # Skip for PRs to avoid secrets exposure
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install security scanning tools
        run: |
          pip install detect-secrets
          pip install bandit
          npm install -g secretlint
      - name: Scan for secrets in documentation
        run: |
          # `detect-secrets scan --baseline` only refreshes the baseline file
          # and exits successfully; the hook entry point is the one that
          # fails when a secret missing from the baseline is found.
          git ls-files -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline
          secretlint "**/*.md" --format json --output security-scan.json || true
      - name: Check for compliance violations
        # Run even when the secret scan above failed, so one run reports both.
        if: ${{ !cancelled() }}
        run: |
          python scripts/compliance_checker.py \
            --config compliance_config.yaml \
            --output compliance-report.json
      - name: Upload security scan results
        if: ${{ !cancelled() }}
        # upload-artifact v3 has been retired on github.com; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: security-scan-results
          path: |
            security-scan.json
            compliance-report.json
# Performance and optimization analysis
  performance-analysis:
    # Scheduled-only job: analyses the content tree and opens an issue
    # containing the generated optimization recommendations.
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    permissions:
      contents: read
      issues: write # required by the issue-creation step below
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Analyze content performance
        run: |
          python scripts/performance_analyzer.py \
            --input-dir . \
            --output performance-analysis.json \
            --check-file-sizes \
            --analyze-image-optimization \
            --check-external-dependencies
      - name: Generate optimization recommendations
        run: |
          python scripts/optimization_recommender.py \
            --analysis performance-analysis.json \
            --output optimization-recommendations.md
      - name: Create optimization issue
        # No step-level condition needed: the whole job is schedule-only.
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            try {
              const recommendations = fs.readFileSync('optimization-recommendations.md', 'utf8');
              // Skip issue creation when the recommender produced nothing.
              if (recommendations.trim()) {
                await github.rest.issues.create({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  title: `📈 Weekly Performance Optimization Recommendations - ${new Date().toISOString().split('T')[0]}`,
                  body: recommendations,
                  labels: ['documentation', 'performance', 'automated']
                });
              }
            } catch (error) {
              console.error('Failed to create optimization issue:', error);
            }
# Quality gates and deployment checks
  quality-gates:
    # Evaluates the reports produced by the validation jobs and records
    # the outcome as a commit status.
    runs-on: ubuntu-latest
    needs: [quick-validation, comprehensive-validation]
    # Run when at least one validation job succeeded (the other is
    # normally skipped, depending on the triggering event).
    if: always() && (needs.quick-validation.result == 'success' || needs.comprehensive-validation.result == 'success')
    permissions:
      contents: read
      actions: read
      statuses: write # required by createCommitStatus below
    steps:
      # The checker script and its config live in the repository, so the
      # job needs a checkout and a Python interpreter of its own.
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install validation tools
        run: |
          pip install -r requirements-validation.txt
      - name: Download validation results
        # v3 has been retired on github.com and cannot read v4 uploads.
        uses: actions/download-artifact@v4
        with:
          path: validation-results/
      - name: Check quality gates
        run: |
          python scripts/quality_gates_checker.py \
            --results-dir validation-results/ \
            --config validation_config.yaml \
            --output quality-gates.json
      - name: Set deployment status
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            try {
              const gates = JSON.parse(fs.readFileSync('quality-gates.json', 'utf8'));
              const failures = gates.failures || [];
              const state = gates.passed ? 'success' : 'failure';
              const description = gates.passed ?
                'All quality gates passed' :
                `Quality gates failed: ${failures.join(', ')}`;
              // On pull_request events context.sha is the temporary merge
              // commit; the status has to go on the PR head to be visible.
              const sha = context.payload.pull_request?.head?.sha ?? context.sha;
              await github.rest.repos.createCommitStatus({
                owner: context.repo.owner,
                repo: context.repo.repo,
                sha: sha,
                state: state,
                target_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
                // The API rejects descriptions longer than 140 characters.
                description: description.slice(0, 140),
                context: 'Content Quality Gates'
              });
              core.setOutput('quality_gates_passed', gates.passed);
              if (!gates.passed) {
                core.setFailed(`Quality gates failed: ${failures.join(', ')}`);
              }
            } catch (error) {
              console.error('Failed to process quality gates:', error);
              core.setFailed('Error processing quality gates');
            }
Integration with Content Management
Content validation systems integrate seamlessly with modern documentation workflows. When combined with version control and Git integration systems, automated validation ensures that content quality standards are maintained across collaborative development processes while providing immediate feedback to contributors.
For comprehensive documentation systems, validation complements form creation and user interaction features by ensuring that generated content and user-contributed documentation maintain consistent quality standards and accessibility compliance across all content types.
When building sophisticated content architectures, validation works effectively with automated content linking and cross-referencing systems to maintain content integrity, validate link relationships, and ensure that automated content generation produces high-quality, maintainable documentation.
Advanced Quality Metrics and Reporting
Comprehensive Quality Dashboard
Building interactive dashboards for content quality monitoring:
# quality_dashboard.py - Interactive quality metrics dashboard
import json
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output, State
from datetime import datetime, timedelta
import numpy as np
from typing import Dict, List, Any
class QualityDashboard:
    """Dash application that renders documentation quality metrics.

    The constructor loads the current metrics and the historical trend
    rows from two JSON files, builds the page layout, and registers the
    callbacks that populate the four charts.
    """

    def __init__(self, metrics_file: str, trends_file: str):
        """Load both data files, then build the layout and callbacks.

        Args:
            metrics_file: Path of the JSON file holding current metrics.
            trends_file: Path of the JSON file holding trend rows.
        """
        self.app = dash.Dash(__name__)
        self.metrics_data = self.load_metrics(metrics_file)
        self.trends_data = self.load_trends(trends_file)
        self.setup_layout()
        self.setup_callbacks()

    @staticmethod
    def _read_json(file_path: str) -> Any:
        """Parse *file_path* as UTF-8 encoded JSON."""
        # JSON is UTF-8 by specification; relying on the platform default
        # encoding would mangle non-ASCII text on some systems.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _empty_figure(message: str):
        """Return a blank figure with *message* centred in the plot area."""
        return go.Figure().add_annotation(
            text=message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False
        )

    def load_metrics(self, file_path: str) -> Dict:
        """Load current quality metrics, or defaults if the file is missing."""
        try:
            return self._read_json(file_path)
        except FileNotFoundError:
            return self.get_default_metrics()

    def load_trends(self, file_path: str) -> List[Dict]:
        """Load historical trend data, or an empty list if the file is missing."""
        try:
            return self._read_json(file_path)
        except FileNotFoundError:
            return []

    def get_default_metrics(self) -> Dict:
        """Return the placeholder metrics structure used when no file exists."""
        return {
            'overall_score': 85,
            'total_files': 0,
            'last_updated': datetime.now().isoformat(),
            'categories': {
                'content_quality': {'score': 85, 'issues': 0, 'weight': 0.3},
                'accessibility': {'score': 90, 'issues': 0, 'weight': 0.2},
                'seo': {'score': 80, 'issues': 0, 'weight': 0.2},
                'technical': {'score': 88, 'issues': 0, 'weight': 0.15},
                'security': {'score': 95, 'issues': 0, 'weight': 0.15}
            },
            'rule_performance': {},
            'file_quality_distribution': {}
        }

    def setup_layout(self):
        """Build the static page layout and assign it to the Dash app."""
        overall_score = self.metrics_data.get('overall_score', 0)
        self.app.layout = html.Div([
            # Header
            html.Div([
                html.H1("📊 Documentation Quality Dashboard",
                        className="dashboard-title"),
                html.P(f"Last updated: {self.metrics_data.get('last_updated', 'Unknown')}",
                       className="last-updated"),
            ], className="header-section"),
            # Summary Cards
            html.Div([
                self.create_summary_card("Overall Quality Score",
                                         f"{overall_score}%",
                                         self.get_score_color(overall_score),
                                         "trending-up"),
                self.create_summary_card("Total Files",
                                         str(self.metrics_data.get('total_files', 0)),
                                         "#3498db", "file-text"),
                self.create_summary_card("Issues Found",
                                         str(self.calculate_total_issues()),
                                         "#e74c3c", "alert-triangle"),
                self.create_summary_card("Files Passing",
                                         f"{self.calculate_passing_percentage()}%",
                                         "#2ecc71", "check-circle"),
            ], className="summary-cards"),
            # Quality Score Breakdown
            html.Div([
                html.H2("Quality Score Breakdown"),
                dcc.Graph(id="quality-breakdown-chart"),
            ], className="chart-section"),
            # Trends Over Time
            html.Div([
                html.H2("Quality Trends"),
                dcc.Graph(id="trends-chart"),
            ], className="chart-section"),
            # Rule Performance
            html.Div([
                html.H2("Validation Rule Performance"),
                dcc.Graph(id="rule-performance-chart"),
            ], className="chart-section"),
            # File Quality Distribution
            html.Div([
                html.H2("File Quality Distribution"),
                dcc.Graph(id="file-quality-chart"),
            ], className="chart-section"),
            # Detailed Issues Table
            html.Div([
                html.H2("Recent Issues"),
                dash_table.DataTable(
                    id='issues-table',
                    columns=[
                        {"name": "File", "id": "file_path"},
                        {"name": "Rule", "id": "rule_id"},
                        {"name": "Severity", "id": "severity"},
                        {"name": "Message", "id": "message"},
                        {"name": "Fix Suggestion", "id": "fix_suggestion"}
                    ],
                    data=self.get_recent_issues(),
                    style_cell={'textAlign': 'left', 'overflow': 'hidden',
                                'textOverflow': 'ellipsis', 'maxWidth': 0},
                    style_data_conditional=[
                        {
                            'if': {'filter_query': '{severity} = error'},
                            'backgroundColor': '#ffebee',
                            'color': 'black',
                        },
                        {
                            'if': {'filter_query': '{severity} = warning'},
                            'backgroundColor': '#fff3e0',
                            'color': 'black',
                        }
                    ],
                    page_size=20,
                    sort_action="native",
                    filter_action="native"
                ),
            ], className="table-section"),
        ], className="dashboard-container")

    def create_summary_card(self, title: str, value: str, color: str, icon: str) -> "html.Div":
        """Create a summary card: an icon next to a title and coloured value."""
        return html.Div([
            html.Div([
                html.I(className=f"fas fa-{icon}", style={"color": color, "fontSize": "2rem"}),
            ], className="card-icon"),
            html.Div([
                html.H3(title, className="card-title"),
                html.P(value, className="card-value", style={"color": color}),
            ], className="card-content"),
        ], className="summary-card")

    def get_score_color(self, score: int) -> str:
        """Map a score to a colour: >= 90 green, >= 75 orange, else red."""
        if score >= 90:
            return "#2ecc71"  # Green
        elif score >= 75:
            return "#f39c12"  # Orange
        else:
            return "#e74c3c"  # Red

    def calculate_total_issues(self) -> int:
        """Sum the ``issues`` counts of all metric categories."""
        return sum(cat.get('issues', 0) for cat in self.metrics_data.get('categories', {}).values())

    def calculate_passing_percentage(self) -> int:
        """Return the percentage of files rated ``excellent`` or ``good``.

        The result is truncated to an int; it is 0 when no distribution
        data is available.
        """
        file_dist = self.metrics_data.get('file_quality_distribution', {})
        total_files = sum(file_dist.values()) if file_dist else 1
        passing_files = file_dist.get('excellent', 0) + file_dist.get('good', 0)
        return int((passing_files / total_files) * 100) if total_files > 0 else 0

    def get_recent_issues(self) -> List[Dict]:
        """Return the rows shown in the issues table (static sample data)."""
        # This would typically come from a more detailed validation results file
        return [
            {
                "file_path": "docs/api/authentication.md",
                "rule_id": "heading_too_long",
                "severity": "warning",
                "message": "Heading too long (85 chars, max 80)",
                "fix_suggestion": "Shorten heading text"
            },
            {
                "file_path": "content/guides/getting-started.md",
                "rule_id": "link_broken_external",
                "severity": "error",
                "message": "Broken external link: https://example.com/api",
                "fix_suggestion": "Update or remove broken link"
            }
        ]

    def setup_callbacks(self):
        """Register the callbacks that build each chart's figure.

        Every callback takes its own graph ``id`` as input, so it runs
        once when the page loads.
        """
        @self.app.callback(
            Output('quality-breakdown-chart', 'figure'),
            Input('quality-breakdown-chart', 'id')
        )
        def update_quality_breakdown(chart_id):
            categories = self.metrics_data.get('categories', {})
            fig = go.Figure()
            # Create radar chart for quality breakdown
            categories_list = list(categories.keys())
            scores = [categories[cat].get('score', 0) for cat in categories_list]
            fig.add_trace(go.Scatterpolar(
                r=scores,
                theta=categories_list,
                fill='toself',
                name='Quality Scores',
                line_color='rgb(106, 81, 163)'
            ))
            fig.update_layout(
                polar=dict(
                    radialaxis=dict(
                        visible=True,
                        range=[0, 100]
                    )),
                showlegend=True,
                title="Quality Score by Category"
            )
            return fig

        @self.app.callback(
            Output('trends-chart', 'figure'),
            Input('trends-chart', 'id')
        )
        def update_trends_chart(chart_id):
            if not self.trends_data:
                return self._empty_figure("No trend data available")
            df = pd.DataFrame(self.trends_data)
            # Rows lacking the plotted columns would raise KeyError below
            # and leave the chart broken; show the placeholder instead.
            if not {'date', 'overall_score', 'total_issues'}.issubset(df.columns):
                return self._empty_figure("No trend data available")
            df['date'] = pd.to_datetime(df['date'])
            fig = make_subplots(
                rows=2, cols=1,
                subplot_titles=('Overall Quality Score', 'Issues Count'),
                vertical_spacing=0.1
            )
            # Overall score trend
            fig.add_trace(
                go.Scatter(
                    x=df['date'],
                    y=df['overall_score'],
                    mode='lines+markers',
                    name='Quality Score',
                    line=dict(color='#3498db', width=3)
                ),
                row=1, col=1
            )
            # Issues count trend
            fig.add_trace(
                go.Scatter(
                    x=df['date'],
                    y=df['total_issues'],
                    mode='lines+markers',
                    name='Total Issues',
                    line=dict(color='#e74c3c', width=3)
                ),
                row=2, col=1
            )
            fig.update_layout(height=600, showlegend=True)
            return fig

        @self.app.callback(
            Output('rule-performance-chart', 'figure'),
            Input('rule-performance-chart', 'id')
        )
        def update_rule_performance(chart_id):
            rule_perf = self.metrics_data.get('rule_performance', {})
            if not rule_perf:
                return self._empty_figure("No rule performance data available")
            rules = list(rule_perf.keys())
            issue_counts = [rule_perf[rule].get('issues', 0) for rule in rules]
            fig = go.Figure(data=[
                go.Bar(
                    x=issue_counts,
                    y=rules,
                    orientation='h',
                    # Red above 10 issues, orange above 5, green otherwise.
                    marker_color=['#e74c3c' if count > 10 else '#f39c12' if count > 5 else '#2ecc71'
                                  for count in issue_counts]
                )
            ])
            fig.update_layout(
                title="Issues by Validation Rule",
                xaxis_title="Number of Issues",
                yaxis_title="Validation Rule",
                height=max(400, len(rules) * 30)
            )
            return fig

        @self.app.callback(
            Output('file-quality-chart', 'figure'),
            Input('file-quality-chart', 'id')
        )
        def update_file_quality_chart(chart_id):
            file_dist = self.metrics_data.get('file_quality_distribution', {})
            if not file_dist:
                return self._empty_figure("No file quality distribution data available")
            labels = list(file_dist.keys())
            values = list(file_dist.values())
            colors = {
                'excellent': '#2ecc71',
                'good': '#3498db',
                'fair': '#f39c12',
                'poor': '#e74c3c',
                'critical': '#8e44ad'
            }
            fig = go.Figure(data=[
                go.Pie(
                    labels=labels,
                    values=values,
                    # Unknown quality labels fall back to grey.
                    marker_colors=[colors.get(label, '#95a5a6') for label in labels],
                    hole=0.3
                )
            ])
            fig.update_layout(
                title="File Quality Distribution",
                annotations=[dict(text='Files', x=0.5, y=0.5, font_size=20, showarrow=False)]
            )
            return fig

    def run(self, debug: bool = True, port: int = 8050):
        """Start the dashboard's development server.

        Args:
            debug: Passed through to Dash as its ``debug`` flag.
            port: TCP port to listen on.
        """
        # Dash 3 removed ``run_server`` in favour of ``run``; prefer ``run``
        # and fall back only on older releases that lack it.
        start = getattr(self.app, "run", None)
        if start is None:
            start = self.app.run_server
        start(debug=debug, port=port)
# Usage
if __name__ == "__main__":
    # Build the dashboard from the two JSON data files and start serving it.
    QualityDashboard(
        'docs/quality-metrics.json',
        'docs/quality-trends.json',
    ).run()
Troubleshooting Common Validation Issues
Performance Optimization for Large Repositories
Problem: Validation takes too long on large documentation repositories
Solutions:
# performance_optimizer.py - Optimize validation for large repositories
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import asyncio
import aiohttp
import time
from functools import lru_cache
import pickle
from typing import List, Dict, Any
class OptimizedValidator:
    """Validation front end with caching, parallelism and async link checks.

    Attributes:
        config: Configuration dict handed to every ``ContentValidator``.
        cache: Maps ``"<path>:<content hash>"`` to that file's results.
        session: Optional caller-supplied aiohttp session for link checks;
            when ``None`` a short-lived session is created per call.
    """

    def __init__(self, config: Dict):
        self.config = config
        self.cache = {}
        self.session = None

    @staticmethod
    def get_file_hash(file_path: str) -> str:
        """Return the MD5 hex digest of the file's bytes (used as a cache key)."""
        import hashlib
        with open(file_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def setup_caching(self):
        """Setup validation result caching.

        Retained for backward compatibility. Hashing is a regular static
        method now: storing a locally defined function on the instance made
        the object unpicklable, which broke process-based parallelism.
        """
        if getattr(self, 'cache', None) is None:
            self.cache = {}

    def _cache_key(self, file_path: str) -> str:
        """Build the cache key for a file from its path and content hash."""
        return f"{file_path}:{self.get_file_hash(file_path)}"

    @staticmethod
    def _validate_paths(config: Dict, file_paths: List[str]) -> List[Any]:
        """Worker entry point: return ``(path, results)`` for each path.

        A static method, so only the config and the paths are pickled for
        the worker process, not the whole validator object.
        """
        validator = ContentValidator(config)
        return [(path, validator.validate_file(path)) for path in file_paths]

    def validate_files_parallel(self, file_paths: List[str]) -> List[Dict]:
        """Validate files using parallel processing.

        Cache hits are resolved in the calling process and only uncached
        files are dispatched; worker results are written back to
        ``self.cache`` here because workers cannot update the parent's
        cache. Results are returned in the order of *file_paths*.
        """
        if not file_paths:
            return []
        cache_keys = {path: self._cache_key(path) for path in file_paths}
        pending = [path for path, key in cache_keys.items() if key not in self.cache]
        if pending:
            cpu_count = mp.cpu_count()
            chunk_size = max(1, len(pending) // cpu_count)
            # Split files into chunks for parallel processing
            file_chunks = [pending[i:i + chunk_size]
                           for i in range(0, len(pending), chunk_size)]
            with ProcessPoolExecutor(max_workers=cpu_count) as executor:
                futures = [executor.submit(self._validate_paths, self.config, chunk)
                           for chunk in file_chunks]
                for future in futures:
                    for path, file_results in future.result():
                        self.cache[cache_keys[path]] = file_results
        all_results = []
        for path in file_paths:
            all_results.extend(self.cache[cache_keys[path]])
        return all_results

    def validate_chunk(self, file_paths: List[str]) -> List[Dict]:
        """Validate a chunk of files sequentially, consulting the cache first."""
        # NOTE(review): ContentValidator is not in this script's import
        # list - presumably provided by markdown_validator.py; confirm.
        validator = ContentValidator(self.config)
        results = []
        for file_path in file_paths:
            cache_key = self._cache_key(file_path)
            if cache_key not in self.cache:
                self.cache[cache_key] = validator.validate_file(file_path)
            results.extend(self.cache[cache_key])
        return results

    async def validate_external_links_async(self, links: List[str]) -> Dict[str, bool]:
        """Validate external links asynchronously.

        Returns:
            Mapping of each URL to ``True`` when a HEAD request (following
            redirects) answered with a status below 400, else ``False``.
        """
        if not links:
            return {}

        async def check_link(session, url: str) -> bool:
            try:
                async with session.head(url, allow_redirects=True) as response:
                    return response.status < 400
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError, ValueError):
                # Unreachable, timed-out or malformed URLs count as broken.
                # Cancellation is deliberately not swallowed here.
                return False

        async def check_all(session) -> list:
            return await asyncio.gather(
                *(check_link(session, url) for url in links),
                return_exceptions=True
            )

        if self.session is not None:
            # Caller-supplied session: use it and leave its lifetime alone.
            outcomes = await check_all(self.session)
        else:
            # A session is tied to the event loop that created it, so create
            # one per call (each asyncio.run() starts a fresh loop) and
            # close it on exit.
            timeout = aiohttp.ClientTimeout(total=10)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                outcomes = await check_all(session)
        # Any unexpected exception captured by gather() marks the link broken.
        return {url: (False if isinstance(outcome, BaseException) else outcome)
                for url, outcome in zip(links, outcomes)}

    def incremental_validation(self, base_path: str, changed_files: List[str] = None) -> Dict:
        """Run incremental validation on changed files only.

        Args:
            base_path: Directory in which ``git diff`` is executed.
            changed_files: Explicit file list; when ``None`` the files
                changed between ``HEAD~1`` and ``HEAD`` are used.
        """
        if changed_files is None:
            # Get changed files from Git
            import subprocess
            # --diff-filter=d leaves out deleted files, which no longer
            # exist on disk and therefore cannot be validated.
            # NOTE(review): git prints paths relative to the repository
            # root; confirm base_path is the root or adjust the paths.
            result = subprocess.run(
                ['git', 'diff', '--name-only', '--diff-filter=d', 'HEAD~1', 'HEAD'],
                capture_output=True, text=True, cwd=base_path)
            changed_files = result.stdout.strip().split('\n') if result.stdout else []
        changed_files = [f for f in changed_files if f.endswith(('.md', '.markdown'))]
        if not changed_files:
            return {'message': 'No markdown files changed', 'results': []}
        # Validate only changed files
        results = self.validate_files_parallel(changed_files)
        return {
            'changed_files': changed_files,
            'results': results,
            'summary': self.generate_summary(results)
        }

    # NOTE(review): lru_cache on a method also keys on ``self`` and keeps
    # every instance alive for the cache's lifetime; revisit when the real
    # rule logic is implemented.
    @lru_cache(maxsize=1000)
    def cached_rule_check(self, rule_id: str, content_hash: str, rule_params: str) -> List[Dict]:
        """Cache expensive rule checks (placeholder: currently returns None)."""
        # This would implement the actual rule logic
        pass

    def smart_link_checking(self, links: List[str]) -> Dict[str, bool]:
        """Check links grouped by domain, pausing one second between domains."""
        # Group links by domain for efficient checking
        from urllib.parse import urlparse
        domain_groups = {}
        for link in links:
            domain_groups.setdefault(urlparse(link).netloc, []).append(link)
        results = {}
        for index, domain_links in enumerate(domain_groups.values()):
            if index:
                # Add delay between domains to be respectful
                time.sleep(1)
            results.update(asyncio.run(
                self.validate_external_links_async(domain_links)
            ))
        return results

    def generate_summary(self, results: List[Dict]) -> Dict:
        """Generate validation summary.

        Returns:
            Dict with the issue total, counts per severity, the five most
            frequent rule ids, and the current time under
            ``validation_time``.
        """
        severity_counts = {'error': 0, 'warning': 0, 'info': 0}
        rule_counts = {}
        for result in results:
            # Tolerate severities beyond the three seeded keys.
            severity = result.get('severity', 'info')
            severity_counts[severity] = severity_counts.get(severity, 0) + 1
            rule_id = result.get('rule_id', 'unknown')
            rule_counts[rule_id] = rule_counts.get(rule_id, 0) + 1
        return {
            'total_issues': len(results),
            'by_severity': severity_counts,
            'top_rules': sorted(rule_counts.items(), key=lambda x: x[1], reverse=True)[:5],
            'validation_time': time.time()
        }
# Usage example
if __name__ == "__main__":
    # This script's import list does not include json, which the final
    # print needs, so import it here.
    import json

    config = {'rules': {'links': {'enabled': True}}}  # Simplified config
    optimizer = OptimizedValidator(config)
    optimizer.setup_caching()
    # Run incremental validation
    results = optimizer.incremental_validation('.')
    print(json.dumps(results, indent=2))
False Positive Management
Problem: Validation rules generating too many false positives
Solutions:
# false_positive_manager.py - Manage and reduce false positives
import re
import yaml
from typing import Dict, List, Set, Optional
from dataclasses import dataclass
@dataclass
class Suppression:
    """One suppression entry for a validation rule.

    Only ``rule_id`` is required; every other field is optional.
    """

    # Identifier of the rule this entry applies to.
    rule_id: str
    # Regular expression searched for in the file path; None = any file.
    file_pattern: Optional[str] = None
    # Regular expression searched for in the flagged line; None = any line.
    line_pattern: Optional[str] = None
    # Free-text justification for the suppression.
    reason: str = ""
    # ISO-format date after which the entry is no longer applied.
    expires: Optional[str] = None
class FalsePositiveManager:
    """Filter validation results through suppressions and whitelists.

    Attributes:
        suppressions: ``Suppression`` entries loaded from a YAML file.
        whitelist_patterns: Built-in regular expressions per rule id;
            a match on a result's context (or line) suppresses it.
    """

    def __init__(self, suppressions_file: str = "validation_suppressions.yaml"):
        self.suppressions = self.load_suppressions(suppressions_file)
        self.whitelist_patterns = self.load_whitelist_patterns()

    def load_suppressions(self, file_path: str) -> "List[Suppression]":
        """Load validation suppressions from YAML file.

        Returns an empty list when the file is missing, empty, or has no
        ``suppressions`` entries.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        except FileNotFoundError:
            return []
        # An empty document loads as None, and a bare ``suppressions:``
        # key has the value None; treat both as "no entries".
        entries = (data or {}).get('suppressions') or []
        return [Suppression(**item) for item in entries]

    def load_whitelist_patterns(self) -> Dict[str, List[str]]:
        """Load rule-specific whitelist patterns"""
        return {
            'link_broken_external': [
                r'https://example\.com.*',  # Example URLs
                r'https://placeholder\..*',  # Placeholder URLs
                r'https://.*\.localhost.*',  # Local development URLs
            ],
            'image_missing_alt': [
                r'!\[.*\]\(.*spacer\.gif\)',  # Spacer images
                r'!\[.*\]\(.*pixel\.png\)',  # Tracking pixels
            ],
            'heading_generic': [
                r'^(Overview|Introduction|Getting Started)$',  # Allow common generic headings
            ]
        }

    @staticmethod
    def _is_expired(expires) -> bool:
        """Return True when the *expires* moment lies in the past.

        Accepts an ISO-format string as well as ``date``/``datetime``
        objects, because YAML loads unquoted dates as date objects.
        """
        from datetime import date, datetime
        if isinstance(expires, datetime):
            deadline = expires
        elif isinstance(expires, date):
            deadline = datetime(expires.year, expires.month, expires.day)
        else:
            deadline = datetime.fromisoformat(str(expires))
        # Compare aware deadlines with an aware "now"; mixing naive and
        # aware datetimes raises TypeError.
        return datetime.now(deadline.tzinfo) > deadline

    def should_suppress(self, result: Dict, file_path: str, line_content: str) -> bool:
        """Check if a validation result should be suppressed.

        A result is suppressed when an unexpired suppression for its rule
        matches the file path and line, or when one of the rule's
        whitelist patterns matches the result's context (falling back to
        the line content when the context is empty).
        """
        rule_id = result.get('rule_id')
        # Check suppressions
        for suppression in self.suppressions:
            if suppression.rule_id != rule_id:
                continue
            # Check file pattern
            if suppression.file_pattern and not re.search(suppression.file_pattern, file_path):
                continue
            # Check line pattern
            if suppression.line_pattern and not re.search(suppression.line_pattern, line_content):
                continue
            # Check expiration
            if suppression.expires and self._is_expired(suppression.expires):
                continue
            return True
        # Check whitelist patterns
        subject = result.get('context', '') or line_content
        return any(re.search(pattern, subject)
                   for pattern in self.whitelist_patterns.get(rule_id, []))

    def create_smart_suppressions(self, results: List[Dict], threshold: int = 5) -> "List[Suppression]":
        """Automatically suggest suppressions for frequently occurring issues.

        Args:
            results: Validation results to analyse.
            threshold: Minimum occurrences of a rule before a suggestion
                is attempted.
        """
        # Collect the contexts of every result, grouped by rule.
        rule_contexts = {}
        for result in results:
            rule_contexts.setdefault(result.get('rule_id'), []).append(result.get('context', ''))
        # Suggest suppressions for high-frequency rules
        suggestions = []
        for rule_id, contexts in rule_contexts.items():
            count = len(contexts)
            if count < threshold:
                continue
            # Analyze contexts to find common patterns
            common_pattern = self.find_common_pattern(contexts)
            if common_pattern:
                suggestions.append(Suppression(
                    rule_id=rule_id,
                    line_pattern=common_pattern,
                    reason=f"Auto-suggested: {count} occurrences with common pattern"
                ))
        return suggestions

    def find_common_pattern(self, contexts: List[str]) -> Optional[str]:
        """Find common pattern in validation contexts.

        Returns an escaped literal when all contexts are identical, a URL
        pattern when every context shares a domain, otherwise ``None``.
        """
        if len(contexts) < 2:
            return None
        if len(set(contexts)) == 1:
            # All contexts are identical
            return re.escape(contexts[0])
        # Check for common URL patterns
        if all('http' in ctx for ctx in contexts):
            # Stop the host at whitespace as well as "/", so text following
            # a path-less URL is not mistaken for part of the domain.
            domains = [re.findall(r'https?://([^/\s]+)', ctx) for ctx in contexts]
            common_domains = set(domains[0])
            for domain_list in domains[1:]:
                common_domains &= set(domain_list)
            # Pick in first-context order; taking an arbitrary set member
            # made the suggestion vary between runs.
            for domain in domains[0]:
                if domain in common_domains:
                    return f'https?://{re.escape(domain)}.*'
        return None

    def filter_results(self, results: List[Dict], file_path: str, file_lines: List[str]) -> List[Dict]:
        """Filter validation results to remove suppressed issues.

        Args:
            results: Results reported for one file.
            file_path: Path of that file.
            file_lines: The file's lines; ``line_number`` is 1-based.
        """
        filtered_results = []
        for result in results:
            # Results may carry ``line_number: None``; treat it as "no line".
            line_num = result.get('line_number') or 0
            line_content = file_lines[line_num - 1] if 0 < line_num <= len(file_lines) else ""
            if not self.should_suppress(result, file_path, line_content):
                filtered_results.append(result)
        return filtered_results
Conclusion
Automated Markdown content validation and quality assurance systems provide the foundation for maintaining professional documentation standards while enabling teams to scale their content creation processes efficiently. By implementing comprehensive validation frameworks, establishing quality metrics dashboards, and integrating automated checks into development workflows, organizations can ensure consistent, accessible, and high-quality documentation that serves users effectively while reducing manual oversight burden.
The key to successful validation implementation lies in balancing comprehensive coverage with practical usability, ensuring that validation rules enhance rather than hinder the content creation process. Whether you’re building internal documentation systems, open-source project documentation, or comprehensive knowledge bases, the techniques covered in this guide provide the tools necessary for creating robust, automated quality assurance systems that support both content creators and content consumers.
Remember to continuously refine your validation rules based on team feedback and real-world usage patterns, implement performance optimizations for large repositories, and maintain clear documentation of validation standards to enable effective collaboration. With proper implementation of automated content validation, your documentation can achieve and maintain professional standards while supporting rapid content development and team scalability.