src/server/http/repo/readme.rs
Ref: Size: 16.2 KiB
//! README lookup, decoding, and rendering for the repo overview page.
//!
//! Pure: no Axum / AppState dependencies, so this is unit-testable against
//! fixture repos built with `git2::Repository::init`.
use git2::Repository;
/// Already-safe HTML ready to be rendered with `|safe` in the template.
pub struct RenderedReadme {
pub html: String,
}
const MAX_BLOB_BYTES: usize = 512 * 1024;
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
/// Load and render the README at the root of HEAD's tree.
///
/// Returns `None` when no README is present, the blob is binary or invalid
/// UTF-8, or any unexpected git2 error occurs. The README must never break
/// the overview page — failures degrade silently to "no README".
pub fn load_readme(
repo: &Repository,
repo_name: &str,
branch: &str,
) -> Option<RenderedReadme> {
// Resolve the branch ref directly so this works on bare repos where HEAD
// may point to an unborn branch (e.g. `master` when only `main` is pushed).
let obj = repo
.revparse_single(&format!("refs/heads/{branch}"))
.or_else(|_| repo.revparse_single(branch))
.ok()?;
let commit = obj.peel_to_commit().ok()?;
let tree = commit.tree().ok()?;
let entry = find_readme_entry(&tree)?;
let blob = match repo.find_blob(entry.oid) {
Ok(b) => b,
Err(err) => {
tracing::warn!(repo = repo_name, error = %err, "failed to load readme blob");
return None;
}
};
if blob.is_binary() {
return None;
}
if blob.size() > MAX_BLOB_BYTES {
return Some(RenderedReadme {
html: too_large_notice_with_repo(repo_name, branch, &entry.name),
});
}
let text = std::str::from_utf8(blob.content()).ok()?;
let html = match entry.kind {
ReadmeKind::Markdown => {
let rendered = render_markdown(text);
if rendered.len() > MAX_HTML_BYTES {
too_large_notice_with_repo(repo_name, branch, &entry.name)
} else {
rendered
}
}
ReadmeKind::Plain => {
let rendered = render_plain(text);
if rendered.len() > MAX_HTML_BYTES {
too_large_notice_with_repo(repo_name, branch, &entry.name)
} else {
rendered
}
}
};
Some(RenderedReadme { html })
}
/// Build the "README too large" fallback HTML block. The link path is
/// assembled from `repo_name`, `branch`, and `file_name` after html-escaping.
/// We do NOT percent-encode: callers are expected to pass URL-safe values
/// (git branch and repo names that follow normal slug conventions, and one
/// of the fixed README filenames). Slashes in branch names are intentionally
/// preserved as path separators.
fn too_large_notice_with_repo(repo_name: &str, branch: &str, file_name: &str) -> String {
format!(
"<p><em>README too large to render. \
<a href=\"/{repo}/blob/{branch}/{name}\">View raw</a>.</em></p>",
repo = html_escape(repo_name),
branch = html_escape(branch),
name = html_escape(file_name),
)
}
/// One root-tree entry. Carries the resolved category so the renderer can
/// branch on type without re-parsing the filename.
#[derive(Debug)]
struct ReadmeEntry {
name: String,
oid: git2::Oid,
kind: ReadmeKind,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReadmeKind {
Markdown,
Plain,
}
/// Walk the root tree (non-recursive) and pick the highest-precedence README
/// blob entry. Symlinks and tree entries are ignored.
fn find_readme_entry(tree: &git2::Tree) -> Option<ReadmeEntry> {
// Score: lower is better. None means "not a README".
fn classify(name: &str) -> Option<(u8, ReadmeKind)> {
match name.to_ascii_lowercase().as_str() {
"readme.md" => Some((0, ReadmeKind::Markdown)),
"readme" => Some((1, ReadmeKind::Plain)),
"readme.txt" => Some((2, ReadmeKind::Plain)),
_ => None,
}
}
let mut best: Option<(u8, ReadmeEntry)> = None;
for entry in tree.iter() {
// Skip subtrees, symlinks, submodules — only regular blobs.
if entry.kind() != Some(git2::ObjectType::Blob) {
continue;
}
if entry.filemode() != 0o100644 && entry.filemode() != 0o100755 {
continue; // 0o120000 (symlink) and anything else
}
let name = match entry.name() {
Some(n) => n,
None => continue,
};
let (score, kind) = match classify(name) {
Some(v) => v,
None => continue,
};
let candidate = ReadmeEntry { name: name.to_string(), oid: entry.id(), kind };
match &best {
None => best = Some((score, candidate)),
Some((cur_score, _)) if score < *cur_score => best = Some((score, candidate)),
_ => {}
}
}
best.map(|(_, e)| e)
}
fn render_markdown(src: &str) -> String {
use pulldown_cmark::{Options, Parser, html};
let mut opts = Options::empty();
opts.insert(Options::ENABLE_TABLES);
opts.insert(Options::ENABLE_STRIKETHROUGH);
opts.insert(Options::ENABLE_TASKLISTS);
let parser = Parser::new_ext(src, opts);
let mut unsafe_html = String::new();
html::push_html(&mut unsafe_html, parser);
sanitize(&unsafe_html)
}
fn render_plain(src: &str) -> String {
// Task 4 will tighten this. For now, escape and wrap.
let escaped = html_escape(src);
format!("<pre>{}</pre>", escaped)
}
fn html_escape(s: &str) -> String {
s.replace('&', "&")
.replace('<', "<")
.replace('>', ">")
.replace('"', """)
.replace('\'', "'")
}
fn sanitize(html: &str) -> String {
use std::collections::HashSet;
// Restrict the URL scheme allowlist used by ammonia for href/src and the
// other URL-bearing attributes it tracks. This drops data: image URIs as
// well as javascript: in any href, including <a href>, which are the
// attack vectors that matter for a README.
let mut schemes: HashSet<&str> = HashSet::new();
schemes.insert("http");
schemes.insert("https");
schemes.insert("mailto");
ammonia::Builder::default()
.url_schemes(schemes)
.clean(html)
.to_string()
}
#[cfg(test)]
mod tests {
use super::*;
use git2::Repository;
use tempfile::TempDir;
fn load(repo: &Repository) -> Option<RenderedReadme> {
let branch = repo.head().ok()
.and_then(|h| h.shorthand().map(String::from))
.unwrap_or_else(|| "main".to_string());
load_readme(repo, "test-repo", &branch)
}
/// Build an empty git repo with a single commit containing the given
/// (path, contents) blobs at the root tree. Returns the repo and the
/// tempdir (kept alive by the caller).
fn repo_with_files(files: &[(&str, &[u8])]) -> (Repository, TempDir) {
let tmp = TempDir::new().unwrap();
let repo = Repository::init(tmp.path()).unwrap();
let sig = git2::Signature::now("Test", "test@example.com").unwrap();
{
let tree_oid = {
let mut builder = repo.treebuilder(None).unwrap();
for (name, contents) in files {
let oid = repo.blob(contents).unwrap();
builder.insert(name, oid, 0o100644).unwrap();
}
builder.write().unwrap()
};
let tree = repo.find_tree(tree_oid).unwrap();
repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();
}
(repo, tmp)
}
#[test]
fn no_readme_returns_none() {
let (repo, _tmp) = repo_with_files(&[("lib.rs", b"fn main() {}")]);
assert!(load(&repo).is_none());
}
#[test]
fn finds_uppercase_readme_md() {
let (repo, _tmp) = repo_with_files(&[("README.md", b"# Title\n\nbody\n")]);
let r = load(&repo).expect("README found");
assert!(r.html.contains("<h1>Title</h1>"), "got: {}", r.html);
}
#[test]
fn finds_mixed_case_readme_md() {
let (repo, _tmp) = repo_with_files(&[("Readme.MD", b"# T\n")]);
assert!(load(&repo).is_some());
}
#[test]
fn md_wins_over_txt() {
let (repo, _tmp) = repo_with_files(&[
("README.md", b"# md\n"),
("README.txt", b"plain"),
]);
let r = load(&repo).unwrap();
assert!(r.html.contains("<h1>md</h1>"));
assert!(!r.html.contains("plain"));
}
#[test]
fn readme_wins_over_txt() {
let (repo, _tmp) = repo_with_files(&[
("README", b"plain readme"),
("README.txt", b"plain txt"),
]);
let r = load(&repo).unwrap();
assert!(r.html.contains("plain readme"));
assert!(!r.html.contains("plain txt"));
}
#[test]
fn mixed_case_md_still_wins_over_lowercase_txt() {
let (repo, _tmp) = repo_with_files(&[
("README.md", b"# md\n"),
("readme.txt", b"plain"),
]);
assert!(load(&repo).unwrap().html.contains("<h1>md</h1>"));
}
#[test]
fn nested_readme_is_not_matched() {
// Build a tree containing both:
// docs/README.md (a subtree entry that must be ignored)
// README (a real root-level README that must win)
// This proves `find_readme_entry` does not recurse into subtrees,
// and that subtree entries with README-like names are skipped at the
// root iteration level.
let tmp = TempDir::new().unwrap();
let repo = Repository::init(tmp.path()).unwrap();
let sig = git2::Signature::now("T", "t@e").unwrap();
// The nested blob whose name happens to match a README.
let nested_blob = repo.blob(b"# nested wins (it should not)\n").unwrap();
let mut sub = repo.treebuilder(None).unwrap();
sub.insert("README.md", nested_blob, 0o100644).unwrap();
let sub_oid = sub.write().unwrap();
// The real root README.
let root_blob = repo.blob(b"actual root readme\n").unwrap();
let mut root = repo.treebuilder(None).unwrap();
root.insert("docs", sub_oid, 0o040000).unwrap();
root.insert("README", root_blob, 0o100644).unwrap();
let root_oid = root.write().unwrap();
let tree = repo.find_tree(root_oid).unwrap();
repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();
let r = load(&repo).expect("root README must be found");
assert!(r.html.contains("actual root readme"));
assert!(!r.html.contains("nested wins"));
}
#[test]
fn plain_readme_escapes_script_tag() {
let (repo, _tmp) = repo_with_files(&[
("README", b"<script>alert(1)</script>\nhello"),
]);
let r = load(&repo).unwrap();
assert!(r.html.starts_with("<pre>"));
assert!(r.html.contains("<script>"));
assert!(!r.html.contains("<script>"));
assert!(r.html.contains("hello"));
}
#[test]
fn symlink_readme_is_ignored() {
let tmp = TempDir::new().unwrap();
let repo = Repository::init(tmp.path()).unwrap();
let sig = git2::Signature::now("T", "t@e").unwrap();
// Symlinks are stored as a blob whose contents are the target path,
// with file mode 0o120000.
let target_blob = repo.blob(b"docs/REAL.md").unwrap();
let mut root = repo.treebuilder(None).unwrap();
root.insert("README.md", target_blob, 0o120000).unwrap();
let root_oid = root.write().unwrap();
let tree = repo.find_tree(root_oid).unwrap();
repo.commit(Some("HEAD"), &sig, &sig, "init", &tree, &[]).unwrap();
assert!(load(&repo).is_none());
}
#[test]
fn markdown_strips_script_tag() {
// Passes because pulldown-cmark does not emit raw HTML without
// ENABLE_UNSAFE_HTML — defense-in-depth canary, not an ammonia test.
let (repo, _tmp) = repo_with_files(&[
("README.md", b"# t\n\n<script>alert(1)</script>\n"),
]);
let html = load(&repo).unwrap().html;
assert!(!html.contains("<script>"), "got: {}", html);
}
#[test]
fn markdown_strips_javascript_href() {
// Passes because pulldown-cmark does not emit raw HTML without
// ENABLE_UNSAFE_HTML — defense-in-depth canary, not an ammonia test.
let (repo, _tmp) = repo_with_files(&[
("README.md", b"[click](javascript:alert(1))\n"),
]);
let html = load(&repo).unwrap().html;
assert!(!html.contains("javascript:"), "got: {}", html);
}
#[test]
fn markdown_strips_onerror_attribute() {
// Passes because pulldown-cmark does not emit raw HTML without
// ENABLE_UNSAFE_HTML — defense-in-depth canary, not an ammonia test.
let (repo, _tmp) = repo_with_files(&[
("README.md", b"<img src=\"https://x/y.png\" onerror=\"alert(1)\">\n"),
]);
let html = load(&repo).unwrap().html;
assert!(!html.contains("onerror"), "got: {}", html);
}
#[test]
fn markdown_strips_iframe() {
// Passes because pulldown-cmark does not emit raw HTML without
// ENABLE_UNSAFE_HTML — defense-in-depth canary, not an ammonia test.
let (repo, _tmp) = repo_with_files(&[
("README.md", b"<iframe src=\"https://evil.example/\"></iframe>\n"),
]);
let html = load(&repo).unwrap().html;
assert!(!html.contains("<iframe"), "got: {}", html);
}
#[test]
fn markdown_strips_data_image_uri() {
let (repo, _tmp) = repo_with_files(&[
("README.md", b"<img src=\"data:image/png;base64,AAAA\">\n"),
]);
let html = load(&repo).unwrap().html;
// Either the whole <img> is dropped or the src attr is gone.
assert!(!html.contains("data:"), "got: {}", html);
}
#[test]
fn empty_readme_renders_empty_body() {
let (repo, _tmp) = repo_with_files(&[("README.md", b"")]);
let r = load(&repo).expect("present-but-empty is Some");
// ammonia of empty markdown is the empty string; assert it's not the
// too-large notice and not None.
assert!(!r.html.contains("too large"));
}
#[test]
fn oversized_blob_returns_too_large_notice() {
let big = vec![b'x'; 600 * 1024];
let (repo, _tmp) = repo_with_files(&[("README.md", &big)]);
let r = load(&repo).unwrap();
assert!(r.html.contains("too large"));
assert!(r.html.contains("/blob/"));
assert!(r.html.contains("README.md"));
}
#[test]
fn markdown_bomb_post_render_cap_trips() {
// Code-span paragraphs: "`a`\n\n" (5 bytes each) expand to
// "<p><code>a</code></p>\n" (~22 bytes each) — a 4.4x ratio.
// At 100_000 reps: source = 500_000 bytes < 512 KiB (524_288),
// rendered ≈ 2_200_000 bytes > 2 MiB (2_097_152).
// Calibrated empirically: ratio confirmed at 4.40x.
let mut src = String::new();
for _ in 0..100_000usize {
src.push_str("`a`\n\n");
}
assert!(src.len() < MAX_BLOB_BYTES, "source {} >= blob cap {}", src.len(), MAX_BLOB_BYTES);
let (repo, _tmp) = repo_with_files(&[("README.md", src.as_bytes())]);
let r = load(&repo).unwrap();
assert!(r.html.contains("too large"), "expected bomb to trip cap");
}
#[test]
fn plain_text_post_render_cap_trips() {
// 500 KiB of '&' → ~2.5 MiB of "&" inside a <pre>.
let bytes = vec![b'&'; 500 * 1024];
let (repo, _tmp) = repo_with_files(&[("README", &bytes)]);
let r = load(&repo).unwrap();
assert!(r.html.contains("too large"), "expected plain-text bomb to trip cap");
}
#[test]
fn binary_blob_returns_none() {
let (repo, _tmp) = repo_with_files(&[("README.md", &[0u8, 1, 2, 3, 0xff, 0xfe])]);
assert!(load(&repo).is_none());
}
#[test]
fn invalid_utf8_returns_none() {
// Mostly valid text + a stray 0x80 byte. Not flagged as binary by git2's
// heuristic (no NULs), but not valid UTF-8 either.
let mut bytes = Vec::from(&b"hello world\nmore text\n"[..]);
bytes.push(0x80);
bytes.extend_from_slice(b"\nmore\n");
let (repo, _tmp) = repo_with_files(&[("README.md", &bytes)]);
assert!(load(&repo).is_none());
}
}