|
@@ -4,71 +4,82 @@ set -Eeuo pipefail
|
|
|
self="$(basename "$0")"
|
|
|
usage() {
|
|
|
cat <<-EOU
|
|
|
- usage: $self path/to/README.md
|
|
|
+ usage: $self path/to/markdown.md
|
|
|
eg: $self README.md
|
|
|
|
|
|
- WARNING: if README.md has the TOC-replacement comments,
|
|
|
- README.md.bak will be clobbered and the TOC will be inserted
|
|
|
+ WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
|
|
|
EOU
|
|
|
}
|
|
|
|
|
|
markdown="${1:-}"
|
|
|
-if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
|
|
|
+if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
|
|
|
|
|
|
-toc="$(
|
|
|
- gawk '
|
|
|
- # ignore comments in code blocks, which are not headers but look like them
|
|
|
- /^```/ { ignore = !ignore }
|
|
|
+# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
|
|
|
|
|
|
- /^#/ && !ignore {
|
|
|
- level = length($1)
|
|
|
- $1 = ""
|
|
|
- gsub(/^[[:space:]]|[[:space:]]$/, "")
|
|
|
+jq --raw-input --null-input --raw-output '
|
|
|
+ reduce inputs as $line ({ toc: "" };
|
|
|
+ if $line | test("^```") then
|
|
|
+ .ignore |= not
|
|
|
+ else . end
|
|
|
+ | if .ignore then . else
|
|
|
+ (
|
|
|
+ $line
|
|
|
+ | capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
|
|
|
+ // null
|
|
|
+ ) as $cap
|
|
|
+ | if $cap then
|
|
|
+ ($cap.hash | length) as $level
|
|
|
+ | .levels[$level] += 1
|
|
|
+ | .levels |= (.[range($level+1; length)] = 0)
|
|
|
+ | (
|
|
|
+ $cap.heading
|
|
|
+ | ascii_downcase
|
|
|
+ # https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
|
|
|
+ | gsub(" "; "-")
|
|
|
+ # escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
|
|
|
+ #| gsub("%[abcdef0-9]{2}"; ""; "i")
|
|
|
+ | gsub("%"; "")
|
|
|
+ # single chars that are removed
|
|
|
+ | gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
|
|
|
+ # CJK punctuations that are removed
|
|
|
+ | gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
|
|
|
+ # Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
|
|
|
+ | (split("") | map(select(utf8bytelength == 1)) | join(""))
|
|
|
+ # TODO Strip embedded markdown formatting
|
|
|
+ ) as $anchor
|
|
|
+ # handle repetition (same end anchor)
|
|
|
+ | (
|
|
|
+ (.seen // []) as $seen
|
|
|
+ | first(
|
|
|
+ # this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
|
|
|
+ $anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
|
|
|
+ | select(IN($seen[]) | not)
|
|
|
+ )
|
|
|
+ // error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
|
|
|
+ ) as $finalAnchor
|
|
|
+ | .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
|
|
|
+ | .seen += [ $finalAnchor ]
|
|
|
+ else . end
|
|
|
+ end
|
|
|
+ )
|
|
|
+ | .toc
|
|
|
+' "$markdown" > "$markdown.toc"
|
|
|
|
|
|
- ++levelCounter[level]
|
|
|
- for (i in levelCounter) {
|
|
|
- if (i > level) {
|
|
|
- levelCounter[i] = 0
|
|
|
- }
|
|
|
- }
|
|
|
- prefix = levelCounter[level] ".\t"
|
|
|
- for (i = 1; i < level; ++i) {
|
|
|
- prefix = "\t" prefix
|
|
|
- }
|
|
|
-
|
|
|
- # https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
|
|
|
- hash = tolower($0)
|
|
|
- gsub(/ /, "-", hash)
|
|
|
- gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
|
|
|
- gsub(/[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]/, "", hash)
|
|
|
-
|
|
|
- printf "%s[%s](#%s)\n", prefix, $0, hash
|
|
|
- }
|
|
|
- ' "$markdown"
|
|
|
-)"
|
|
|
-
|
|
|
-toFile="${markdown}.bak"
|
|
|
-gawk -v toFile="$toFile" -v toc="$toc" '
|
|
|
- BEGIN { printf "" > toFile }
|
|
|
+gawk -v tocFile="$markdown.toc" '
|
|
|
/^<!-- AUTOGENERATED TOC -->$/ {
|
|
|
inToc = !inToc
|
|
|
seenToc = 1
|
|
|
if (inToc) {
|
|
|
- print >> toFile
|
|
|
- print "" >> toFile
|
|
|
- print toc >> toFile
|
|
|
- print "" >> toFile
|
|
|
- print >> toFile
|
|
|
+ print
|
|
|
+ print ""
|
|
|
+ system("cat " tocFile)
|
|
|
+ # no need for another newline because tocFile should already end with one
|
|
|
+ print
|
|
|
}
|
|
|
next
|
|
|
}
|
|
|
- !inToc { print >> toFile }
|
|
|
- END { if (!seenToc) { close(toFile); printf "" > toFile } }
|
|
|
-' "$markdown"
|
|
|
+ !inToc { print }
|
|
|
+' "$markdown" > "$markdown.bak"
|
|
|
|
|
|
-if [ -s "$toFile" ]; then
|
|
|
- mv "$toFile" "$markdown"
|
|
|
-else
|
|
|
- rm "$toFile"
|
|
|
- echo "$toc"
|
|
|
-fi
|
|
|
+mv -f "$markdown.bak" "$markdown"
|
|
|
+rm -f "$markdown.toc"
|