toc.sh 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #!/usr/bin/env bash
  2. set -Eeuo pipefail
  3. self="$(basename "$0")"
  4. usage() {
  5. cat <<-EOU
  6. usage: $self path/to/markdown.md
  7. eg: $self README.md
  8. WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
  9. EOU
  10. }
  11. markdown="${1:-}"
  12. if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
  13. # see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
  14. jq --raw-input --null-input --raw-output '
  15. reduce inputs as $line ({ toc: "" };
  16. if $line | test("^```") then
  17. .ignore |= not
  18. else . end
  19. | if .ignore then . else
  20. (
  21. $line
  22. | capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
  23. // null
  24. ) as $cap
  25. | if $cap then
  26. ($cap.hash | length) as $level
  27. | .levels[$level] += 1
  28. | .levels |= (.[range($level+1; length)] = 0)
  29. | (
  30. $cap.heading
  31. | ascii_downcase
  32. # https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
  33. | gsub(" "; "-")
  34. # escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
  35. #| gsub("%[abcdef0-9]{2}"; ""; "i")
  36. | gsub("%"; "")
  37. # single chars that are removed
  38. | gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
  39. # CJK punctuations that are removed
  40. | gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
  41. # Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
  42. | (split("") | map(select(utf8bytelength == 1)) | join(""))
  43. # TODO Strip embedded markdown formatting
  44. ) as $anchor
  45. # handle repetition (same end anchor)
  46. | (
  47. (.seen // []) as $seen
  48. | first(
  49. # this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
  50. $anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
  51. | select(IN($seen[]) | not)
  52. )
  53. // error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
  54. ) as $finalAnchor
  55. | .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
  56. | .seen += [ $finalAnchor ]
  57. else . end
  58. end
  59. )
  60. | .toc
  61. ' "$markdown" > "$markdown.toc"
  62. gawk -v tocFile="$markdown.toc" '
  63. /^<!-- AUTOGENERATED TOC -->$/ {
  64. inToc = !inToc
  65. seenToc = 1
  66. if (inToc) {
  67. print
  68. print ""
  69. system("cat " tocFile)
  70. # no need for another newline because tocFile should already end with one
  71. print
  72. }
  73. next
  74. }
  75. !inToc { print }
  76. ' "$markdown" > "$markdown.bak"
  77. mv -f "$markdown.bak" "$markdown"
  78. rm -f "$markdown.toc"