Browse Source

Improve `toc.sh` to support more of GitHub's edge cases

Most notably, this allows emoji in headers to generate the correct anchors.

See https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for the test cases I used while working on this. Reverse engineering GitHub's implementation turned up some bugs in it (which is fun), as well as quirks in the library I was copying that do not match GitHub's behavior.

I've also verified that this causes no changes to the TOC here or over in the docs repo (as expected).
Tianon Gravi 7 months ago
parent
commit
190775c582
1 changed file with 62 additions and 51 deletions
  1. 62 51
      toc.sh

+ 62 - 51
toc.sh

@@ -4,71 +4,82 @@ set -Eeuo pipefail
 self="$(basename "$0")"
 usage() {
 	cat <<-EOU
-		usage: $self path/to/README.md
+		usage: $self path/to/markdown.md
 		   eg: $self README.md
 
-		WARNING: if README.md has the TOC-replacement comments,
-		         README.md.bak will be clobbered and the TOC will be inserted
+		WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
 	EOU
 }
 
 markdown="${1:-}"
-if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
+if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
 
-toc="$(
-	gawk '
-		# ignore comments in code blocks, which are not headers but look like them
-		/^```/ { ignore = !ignore }
+# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
 
-		/^#/ && !ignore {
-			level = length($1)
-			$1 = ""
-			gsub(/^[[:space:]]|[[:space:]]$/, "")
+jq --raw-input --null-input --raw-output '
+	reduce inputs as $line ({ toc: "" };
+		if $line | test("^```") then
+			.ignore |= not
+		else . end
+		| if .ignore then . else
+			(
+				$line
+				| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
+				// null
+			) as $cap
+			| if $cap then
+				($cap.hash | length) as $level
+				| .levels[$level] += 1
+				| .levels |= (.[range($level+1; length)] = 0)
+				| (
+					$cap.heading
+					| ascii_downcase
+					# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
+					| gsub(" "; "-")
+					# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
+					#| gsub("%[abcdef0-9]{2}"; ""; "i")
+					| gsub("%"; "")
+					# single chars that are removed
+					| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
+					# CJK punctuations that are removed (full-width forms; the ASCII equivalents are already stripped by the previous gsub)
+					| gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
+					# Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
+					| (split("") | map(select(utf8bytelength == 1)) | join(""))
+					# TODO Strip embedded markdown formatting
+				) as $anchor
+				# handle repetition (same end anchor)
+				| (
+					(.seen // []) as $seen
+					| first(
+						# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
+						$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
+						| select(IN($seen[]) | not)
+					)
+					// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
+				) as $finalAnchor
+				| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
+				| .seen += [ $finalAnchor ]
+			else . end
+		end
+	)
+	| .toc
+' "$markdown" > "$markdown.toc"
 
-			++levelCounter[level]
-			for (i in levelCounter) {
-				if (i > level) {
-					levelCounter[i] = 0
-				}
-			}
-			prefix = levelCounter[level] ".\t"
-			for (i = 1; i < level; ++i) {
-				prefix = "\t" prefix
-			}
-
-			# https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
-			hash = tolower($0)
-			gsub(/ /, "-", hash)
-			gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
-			gsub(/[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]/, "", hash)
-
-			printf "%s[%s](#%s)\n", prefix, $0, hash
-		}
-	' "$markdown"
-)"
-
-toFile="${markdown}.bak"
-gawk -v toFile="$toFile" -v toc="$toc" '
-	BEGIN { printf "" > toFile }
+gawk -v tocFile="$markdown.toc" '
 	/^<!-- AUTOGENERATED TOC -->$/ {
 		inToc = !inToc
 		seenToc = 1
 		if (inToc) {
-			print >> toFile
-			print "" >> toFile
-			print toc >> toFile
-			print "" >> toFile
-			print >> toFile
+			print
+			print ""
+			while ((getline tocLine < tocFile) > 0) print tocLine; close(tocFile)
+			# no need for another newline because tocFile should already end with one
+			print
 		}
 		next
 	}
-	!inToc { print >> toFile }
-	END { if (!seenToc) { close(toFile); printf "" > toFile } }
-' "$markdown"
+	!inToc { print }
+' "$markdown" > "$markdown.bak"
 
-if [ -s "$toFile" ]; then
-	mv "$toFile" "$markdown"
-else
-	rm "$toFile"
-	echo "$toc"
-fi
+mv -f "$markdown.bak" "$markdown"
+rm -f "$markdown.toc"