src/server/http/repo/readme.rs

Ref: Size: 16.2 KiB
//! README lookup, decoding, and rendering for the repo overview page.
//!
//! Pure: no Axum / AppState dependencies, so this is unit-testable against
//! fixture repos built with `git2::Repository::init`.

use git2::Repository;

/// Already-safe HTML ready to be rendered with `|safe` in the template.
///
/// Produced by [`load_readme`]. The fragment is one of: sanitized Markdown
/// output, an HTML-escaped `<pre>` block, or the "README too large" notice.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RenderedReadme {
    /// HTML fragment that must be emitted without further escaping.
    pub html: String,
}

/// Bytes in one kibibyte.
const KIB: usize = 1024;
/// Bytes in one mebibyte.
const MIB: usize = 1024 * KIB;

/// Largest README blob (512 KiB) that will be decoded and rendered; bigger
/// blobs are replaced by the "too large" notice.
const MAX_BLOB_BYTES: usize = 512 * KIB;
/// Largest rendered HTML (2 MiB) that will be returned; bigger output is
/// replaced by the "too large" notice.
const MAX_HTML_BYTES: usize = 2 * MIB;

/// Load and render the README at the root of `branch`'s tree.
///
/// `branch` is resolved as `refs/heads/{branch}` first and, failing that, as
/// a general revspec; the result is peeled to a commit and only the direct
/// children of that commit's tree are searched for a README.
///
/// # Arguments
///
/// * `repo` - repository to read from.
/// * `repo_name` - name used in log output and in the "too large" link.
/// * `branch` - branch (or other revision) whose tree is inspected.
///
/// # Returns
///
/// * `Some` with already-safe HTML when a README was found and rendered.
/// * `Some` with a "README too large" notice when the blob exceeds
///   `MAX_BLOB_BYTES` or the rendered HTML exceeds `MAX_HTML_BYTES`.
/// * `None` when the revision cannot be resolved, no README is present, the
///   blob is binary or invalid UTF-8, or any unexpected git2 error occurs.
///
/// The README must never break the overview page — failures degrade silently
/// to "no README". Only a failed blob lookup is logged, as a warning.
pub fn load_readme(
    repo: &Repository,
    repo_name: &str,
    branch: &str,
) -> Option<RenderedReadme> {
    // Resolve the branch ref directly so this works on bare repos where HEAD
    // may point to an unborn branch (e.g. `master` when only `main` is pushed).
    let obj = repo
        .revparse_single(&format!("refs/heads/{branch}"))
        .or_else(|_| repo.revparse_single(branch))
        .ok()?;
    let commit = obj.peel_to_commit().ok()?;
    let tree = commit.tree().ok()?;
    let entry = find_readme_entry(&tree)?;

    let blob = match repo.find_blob(entry.oid) {
        Ok(b) => b,
        Err(err) => {
            tracing::warn!(repo = repo_name, error = %err, "failed to load readme blob");
            return None;
        }
    };

    // Binary content is never rendered, regardless of its size.
    if blob.is_binary() {
        return None;
    }

    // Single source for the fallback so both size caps produce the same notice.
    let too_large = || RenderedReadme {
        html: too_large_notice_with_repo(repo_name, branch, &entry.name),
    };

    // Pre-render cap: refuse to decode or render very large blobs at all.
    if blob.size() > MAX_BLOB_BYTES {
        return Some(too_large());
    }

    let text = std::str::from_utf8(blob.content()).ok()?;

    let rendered = match entry.kind {
        ReadmeKind::Markdown => render_markdown(text),
        ReadmeKind::Plain => render_plain(text),
    };

    // Post-render cap: small sources can still expand into very large HTML.
    if rendered.len() > MAX_HTML_BYTES {
        return Some(too_large());
    }
    Some(RenderedReadme { html: rendered })
}

/// Produce the fallback HTML shown when a README is too large to render.
///
/// The notice links to `/{repo_name}/blob/{branch}/{file_name}`. Each of the
/// three components is html-escaped before being placed in the `href`, but
/// none is percent-encoded: callers are expected to pass URL-safe values
/// (git branch and repo names that follow normal slug conventions, and one
/// of the fixed README filenames). Slashes in branch names are intentionally
/// preserved as path separators.
fn too_large_notice_with_repo(repo_name: &str, branch: &str, file_name: &str) -> String {
    let repo_part = html_escape(repo_name);
    let branch_part = html_escape(branch);
    let file_part = html_escape(file_name);
    format!(
        "<p><em>README too large to render. \
         <a href=\"/{repo_part}/blob/{branch_part}/{file_part}\">\
         View raw</a>.</em></p>"
    )
}

/// One root-tree entry that was recognised as a README. Carries the resolved
/// category so the renderer can branch on type without re-parsing the
/// filename.
#[derive(Debug)]
struct ReadmeEntry {
    /// Entry name as stored in the tree (original casing); used to build the
    /// link in the "too large" notice.
    name: String,
    /// Object id of the entry, used to look the blob up in the repository.
    oid: git2::Oid,
    /// How the blob's text should be rendered.
    kind: ReadmeKind,
}

/// Rendering strategy for a README, decided from its filename.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReadmeKind {
    /// `readme.md` (ASCII case-insensitive): rendered as Markdown, then
    /// sanitized.
    Markdown,
    /// `readme` or `readme.txt` (ASCII case-insensitive): HTML-escaped and
    /// wrapped in `<pre>`.
    Plain,
}

/// Select the highest-precedence README blob among the direct children of
/// `tree`; subtrees are never descended into.
///
/// Precedence, matched ASCII case-insensitively: `readme.md`, then `readme`,
/// then `readme.txt`. Entries that are not blobs with a regular or executable
/// file mode (subtrees, symlinks, submodules) and entries whose names are not
/// valid UTF-8 are ignored. If several entries share the best precedence, the
/// one encountered first in tree order is returned.
fn find_readme_entry(tree: &git2::Tree) -> Option<ReadmeEntry> {
    /// Map a filename to `(precedence, kind)`; a lower precedence value wins
    /// and `None` means the name is not a README.
    fn precedence(file_name: &str) -> Option<(u8, ReadmeKind)> {
        let lowered = file_name.to_ascii_lowercase();
        if lowered == "readme.md" {
            Some((0, ReadmeKind::Markdown))
        } else if lowered == "readme" {
            Some((1, ReadmeKind::Plain))
        } else if lowered == "readme.txt" {
            Some((2, ReadmeKind::Plain))
        } else {
            None
        }
    }

    tree.iter()
        // Only blob objects: drops subtrees and submodules.
        .filter(|item| item.kind() == Some(git2::ObjectType::Blob))
        // Only regular / executable files: drops 0o120000 (symlink) and
        // anything else.
        .filter(|item| matches!(item.filemode(), 0o100644 | 0o100755))
        .filter_map(|item| {
            let file_name = item.name()?;
            let (score, kind) = precedence(file_name)?;
            let readme = ReadmeEntry {
                name: file_name.to_owned(),
                oid: item.id(),
                kind,
            };
            Some((score, readme))
        })
        // `min_by_key` yields the first of several equal minima, so an
        // earlier entry is never displaced by a later one of equal rank.
        .min_by_key(|(score, _)| *score)
        .map(|(_, readme)| readme)
}

/// Render Markdown source to sanitized HTML.
///
/// Parsing enables the tables, strikethrough and task-list extensions. The
/// parser's HTML output is treated as untrusted and passed through
/// `sanitize` before being returned.
fn render_markdown(src: &str) -> String {
    use pulldown_cmark::{Options, Parser, html};

    let extensions =
        Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TASKLISTS;

    let mut raw_html = String::new();
    html::push_html(&mut raw_html, Parser::new_ext(src, extensions));
    sanitize(&raw_html)
}

/// Render plain text as an HTML-escaped `<pre>` block.
fn render_plain(src: &str) -> String {
    // Task 4 will tighten this. For now, escape and wrap.
    let mut block = String::from("<pre>");
    block.push_str(&html_escape(src));
    block.push_str("</pre>");
    block
}

/// Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) so
/// the result is safe in both text content and quoted attribute values.
/// Every other character is copied through unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Clean an HTML fragment with ammonia, allowing only `http`, `https` and
/// `mailto` URLs.
fn sanitize(html: &str) -> String {
    use std::collections::HashSet;

    // Restrict the URL scheme allowlist used by ammonia for href/src and the
    // other URL-bearing attributes it tracks. This drops data: image URIs as
    // well as javascript: in any href, including <a href>, which are the
    // attack vectors that matter for a README.
    let allowed_schemes: HashSet<&str> = HashSet::from(["http", "https", "mailto"]);

    let mut cleaner = ammonia::Builder::default();
    cleaner.url_schemes(allowed_schemes);
    cleaner.clean(html).to_string()
}

#[cfg(test)]
mod tests {
    use super::*;
    use git2::Repository;
    use tempfile::TempDir;

    /// Call `load_readme` for the branch HEAD currently points at, falling
    /// back to `"main"` when HEAD (or its shorthand) cannot be read.
    fn load(repo: &Repository) -> Option<RenderedReadme> {
        let branch = repo.head().ok()
            .and_then(|h| h.shorthand().map(String::from))
            .unwrap_or_else(|| "main".to_string());
        load_readme(repo, "test-repo", &branch)
    }

    /// Build an empty git repo with a single commit containing the given
    /// (path, contents) blobs at the root tree. Returns the repo and the
    /// tempdir (kept alive by the caller).
    fn repo_with_files(files: &[(&str, &[u8])]) -> (Repository, TempDir) {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();

        let sig = git2::Signature::now("Test", "test@example.com").unwrap();
        {
            let tree_oid = {
                let mut builder = repo.treebuilder(None).unwrap();
                for (name, contents) in files {
                    let oid = repo.blob(contents).unwrap();
                    builder.insert(name, oid, 0o100644).unwrap();
                }
                builder.write().unwrap()
            };
            let tree = repo.find_tree(tree_oid).unwrap();
            repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();
        }

        (repo, tmp)
    }

    #[test]
    fn no_readme_returns_none() {
        let (repo, _tmp) = repo_with_files(&[("lib.rs", b"fn main() {}")]);
        assert!(load(&repo).is_none());
    }

    #[test]
    fn finds_uppercase_readme_md() {
        let (repo, _tmp) = repo_with_files(&[("README.md", b"# Title\n\nbody\n")]);
        let r = load(&repo).expect("README found");
        assert!(r.html.contains("<h1>Title</h1>"), "got: {}", r.html);
    }

    #[test]
    fn finds_mixed_case_readme_md() {
        let (repo, _tmp) = repo_with_files(&[("Readme.MD", b"# T\n")]);
        assert!(load(&repo).is_some());
    }

    #[test]
    fn md_wins_over_txt() {
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"# md\n"),
            ("README.txt", b"plain"),
        ]);
        let r = load(&repo).unwrap();
        assert!(r.html.contains("<h1>md</h1>"));
        assert!(!r.html.contains("plain"));
    }

    #[test]
    fn readme_wins_over_txt() {
        let (repo, _tmp) = repo_with_files(&[
            ("README", b"plain readme"),
            ("README.txt", b"plain txt"),
        ]);
        let r = load(&repo).unwrap();
        assert!(r.html.contains("plain readme"));
        assert!(!r.html.contains("plain txt"));
    }

    #[test]
    fn mixed_case_md_still_wins_over_lowercase_txt() {
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"# md\n"),
            ("readme.txt", b"plain"),
        ]);
        assert!(load(&repo).unwrap().html.contains("<h1>md</h1>"));
    }

    #[test]
    fn nested_readme_is_not_matched() {
        // Build a tree containing both:
        //   docs/README.md  (a README-named blob inside the `docs` subtree)
        //   README          (a real root-level README that must win)
        // This proves `find_readme_entry` does not recurse into subtrees:
        // the nested Markdown file would otherwise outrank the plain root
        // README.
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        let sig = git2::Signature::now("T", "t@e").unwrap();

        // The nested blob whose name happens to match a README.
        let nested_blob = repo.blob(b"# nested wins (it should not)\n").unwrap();
        let mut sub = repo.treebuilder(None).unwrap();
        sub.insert("README.md", nested_blob, 0o100644).unwrap();
        let sub_oid = sub.write().unwrap();

        // The real root README.
        let root_blob = repo.blob(b"actual root readme\n").unwrap();

        let mut root = repo.treebuilder(None).unwrap();
        root.insert("docs", sub_oid, 0o040000).unwrap();
        root.insert("README", root_blob, 0o100644).unwrap();
        let root_oid = root.write().unwrap();
        let tree = repo.find_tree(root_oid).unwrap();
        repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();

        let r = load(&repo).expect("root README must be found");
        assert!(r.html.contains("actual root readme"));
        assert!(!r.html.contains("nested wins"));
    }

    #[test]
    fn plain_readme_escapes_script_tag() {
        let (repo, _tmp) = repo_with_files(&[
            ("README", b"<script>alert(1)</script>\nhello"),
        ]);
        let r = load(&repo).unwrap();
        assert!(r.html.starts_with("<pre>"));
        assert!(r.html.contains("&lt;script&gt;"));
        assert!(!r.html.contains("<script>"));
        assert!(r.html.contains("hello"));
    }

    #[test]
    fn symlink_readme_is_ignored() {
        let tmp = TempDir::new().unwrap();
        let repo = Repository::init(tmp.path()).unwrap();
        let sig = git2::Signature::now("T", "t@e").unwrap();

        // Symlinks are stored as a blob whose contents are the target path,
        // with file mode 0o120000.
        let target_blob = repo.blob(b"docs/REAL.md").unwrap();
        let mut root = repo.treebuilder(None).unwrap();
        root.insert("README.md", target_blob, 0o120000).unwrap();
        let root_oid = root.write().unwrap();
        let tree = repo.find_tree(root_oid).unwrap();
        repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();

        assert!(load(&repo).is_none());
    }

    #[test]
    fn markdown_strips_script_tag() {
        // Exercises the full `render_markdown` pipeline (pulldown-cmark, then
        // `sanitize`). NOTE(review): pulldown-cmark has no
        // `ENABLE_UNSAFE_HTML` option and appears to forward raw HTML
        // unchanged, so the stripping asserted here is presumably done by
        // ammonia — confirm against the pulldown-cmark docs.
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"# t\n\n<script>alert(1)</script>\n"),
        ]);
        let html = load(&repo).unwrap().html;
        assert!(!html.contains("<script>"), "got: {}", html);
    }

    #[test]
    fn markdown_strips_javascript_href() {
        // The input is a Markdown link, not raw HTML. `javascript` is absent
        // from the scheme allowlist built in `sanitize`, which is presumably
        // what removes the URL here — NOTE(review): confirm against the
        // ammonia docs.
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"[click](javascript:alert(1))\n"),
        ]);
        let html = load(&repo).unwrap().html;
        assert!(!html.contains("javascript:"), "got: {}", html);
    }

    #[test]
    fn markdown_strips_onerror_attribute() {
        // Raw HTML in a Markdown README. NOTE(review): as with the
        // script-tag test, the event-handler attribute is presumably removed
        // by ammonia in `sanitize` rather than by pulldown-cmark — confirm.
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"<img src=\"https://x/y.png\" onerror=\"alert(1)\">\n"),
        ]);
        let html = load(&repo).unwrap().html;
        assert!(!html.contains("onerror"), "got: {}", html);
    }

    #[test]
    fn markdown_strips_iframe() {
        // Raw HTML in a Markdown README. NOTE(review): as with the
        // script-tag test, the `<iframe>` is presumably removed by ammonia in
        // `sanitize` rather than by pulldown-cmark — confirm.
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"<iframe src=\"https://evil.example/\"></iframe>\n"),
        ]);
        let html = load(&repo).unwrap().html;
        assert!(!html.contains("<iframe"), "got: {}", html);
    }

    #[test]
    fn markdown_strips_data_image_uri() {
        // `data` is not in the scheme allowlist built in `sanitize`.
        let (repo, _tmp) = repo_with_files(&[
            ("README.md", b"<img src=\"data:image/png;base64,AAAA\">\n"),
        ]);
        let html = load(&repo).unwrap().html;
        // Either the whole <img> is dropped or the src attr is gone.
        assert!(!html.contains("data:"), "got: {}", html);
    }

    #[test]
    fn empty_readme_renders_empty_body() {
        let (repo, _tmp) = repo_with_files(&[("README.md", b"")]);
        let r = load(&repo).expect("present-but-empty is Some");
        // ammonia of empty markdown is the empty string; assert it's not the
        // too-large notice and not None.
        assert!(!r.html.contains("too large"));
    }

    #[test]
    fn oversized_blob_returns_too_large_notice() {
        // 600 KiB exceeds the 512 KiB `MAX_BLOB_BYTES` pre-render cap.
        let big = vec![b'x'; 600 * 1024];
        let (repo, _tmp) = repo_with_files(&[("README.md", &big)]);
        let r = load(&repo).unwrap();
        assert!(r.html.contains("too large"));
        assert!(r.html.contains("/blob/"));
        assert!(r.html.contains("README.md"));
    }

    #[test]
    fn markdown_bomb_post_render_cap_trips() {
        // Code-span paragraphs: "`a`\n\n" (5 bytes each) expand to
        // "<p><code>a</code></p>\n" (~22 bytes each) — a 4.4x ratio.
        // At 100_000 reps: source = 500_000 bytes < 512 KiB (524_288),
        // rendered ≈ 2_200_000 bytes > 2 MiB (2_097_152).
        // Calibrated empirically: ratio confirmed at 4.40x.
        let mut src = String::new();
        for _ in 0..100_000usize {
            src.push_str("`a`\n\n");
        }
        assert!(src.len() < MAX_BLOB_BYTES, "source {} >= blob cap {}", src.len(), MAX_BLOB_BYTES);
        let (repo, _tmp) = repo_with_files(&[("README.md", src.as_bytes())]);
        let r = load(&repo).unwrap();
        assert!(r.html.contains("too large"), "expected bomb to trip cap");
    }

    #[test]
    fn plain_text_post_render_cap_trips() {
        // 500 KiB of '&' → ~2.5 MiB of "&amp;" inside a <pre>.
        let bytes = vec![b'&'; 500 * 1024];
        let (repo, _tmp) = repo_with_files(&[("README", &bytes)]);
        let r = load(&repo).unwrap();
        assert!(r.html.contains("too large"), "expected plain-text bomb to trip cap");
    }

    #[test]
    fn binary_blob_returns_none() {
        let (repo, _tmp) = repo_with_files(&[("README.md", &[0u8, 1, 2, 3, 0xff, 0xfe])]);
        assert!(load(&repo).is_none());
    }

    #[test]
    fn invalid_utf8_returns_none() {
        // Mostly valid text + a stray 0x80 byte. Not flagged as binary by git2's
        // heuristic (no NULs), but not valid UTF-8 either.
        let mut bytes = Vec::from(&b"hello world\nmore text\n"[..]);
        bytes.push(0x80);
        bytes.extend_from_slice(b"\nmore\n");
        let (repo, _tmp) = repo_with_files(&[("README.md", &bytes)]);
        assert!(load(&repo).is_none());
    }
}