浏览代码

Improve `toc.sh` to support more of GitHub's edge cases

Most notably, this allows emoji in headers to generate the correct anchors.

See https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for the set of test cases I used to work through this. While reverse engineering GitHub's implementation, I found bugs in it (which is fun), as well as quirks of the library I was copying that aren't correct for the GitHub implementation.

I've also verified that this causes no changes to the TOC here or over in the docs repo (as expected).
Tianon Gravi 7 月之前
父节点
当前提交
190775c582
共有 1 个文件被更改,包括 62 次插入和 51 次删除
  1. 62 51
      toc.sh

+ 62 - 51
toc.sh

@@ -4,71 +4,82 @@ set -Eeuo pipefail
 self="$(basename "$0")"
 self="$(basename "$0")"
 usage() {
 usage() {
 	cat <<-EOU
 	cat <<-EOU
-		usage: $self path/to/README.md
+		usage: $self path/to/markdown.md
 		   eg: $self README.md
 		   eg: $self README.md
 
 
-		WARNING: if README.md has the TOC-replacement comments,
-		         README.md.bak will be clobbered and the TOC will be inserted
+		WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
 	EOU
 	EOU
 }
 }
 
 
 markdown="${1:-}"
 markdown="${1:-}"
-if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
+if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
 
 
-toc="$(
-	gawk '
-		# ignore comments in code blocks, which are not headers but look like them
-		/^```/ { ignore = !ignore }
+# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
 
 
-		/^#/ && !ignore {
-			level = length($1)
-			$1 = ""
-			gsub(/^[[:space:]]|[[:space:]]$/, "")
+jq --raw-input --null-input --raw-output '
+	reduce inputs as $line ({ toc: "" };
+		if $line | test("^```") then
+			.ignore |= not
+		else . end
+		| if .ignore then . else
+			(
+				$line
+				| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
+				// null
+			) as $cap
+			| if $cap then
+				($cap.hash | length) as $level
+				| .levels[$level] += 1
+				| .levels |= (.[range($level+1; length)] = 0)
+				| (
+					$cap.heading
+					| ascii_downcase
+					# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
+					| gsub(" "; "-")
+					# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
+					#| gsub("%[abcdef0-9]{2}"; ""; "i")
+					| gsub("%"; "")
+					# single chars that are removed
+					| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
+					# CJK punctuations that are removed
+					| gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
+					# Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
+					| (split("") | map(select(utf8bytelength == 1)) | join(""))
+					# TODO Strip embedded markdown formatting
+				) as $anchor
+				# handle repetition (same end anchor)
+				| (
+					(.seen // []) as $seen
+					| first(
+						# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
+						$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
+						| select(IN($seen[]) | not)
+					)
+					// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
+				) as $finalAnchor
+				| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
+				| .seen += [ $finalAnchor ]
+			else . end
+		end
+	)
+	| .toc
+' "$markdown" > "$markdown.toc"
 
 
-			++levelCounter[level]
-			for (i in levelCounter) {
-				if (i > level) {
-					levelCounter[i] = 0
-				}
-			}
-			prefix = levelCounter[level] ".\t"
-			for (i = 1; i < level; ++i) {
-				prefix = "\t" prefix
-			}
-
-			# https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
-			hash = tolower($0)
-			gsub(/ /, "-", hash)
-			gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
-			gsub(/[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]/, "", hash)
-
-			printf "%s[%s](#%s)\n", prefix, $0, hash
-		}
-	' "$markdown"
-)"
-
-toFile="${markdown}.bak"
-gawk -v toFile="$toFile" -v toc="$toc" '
-	BEGIN { printf "" > toFile }
+gawk -v tocFile="$markdown.toc" '
 	/^<!-- AUTOGENERATED TOC -->$/ {
 	/^<!-- AUTOGENERATED TOC -->$/ {
 		inToc = !inToc
 		inToc = !inToc
 		seenToc = 1
 		seenToc = 1
 		if (inToc) {
 		if (inToc) {
-			print >> toFile
-			print "" >> toFile
-			print toc >> toFile
-			print "" >> toFile
-			print >> toFile
+			print
+			print ""
+			system("cat " tocFile)
+			# no need for another newline because tocFile should already end with one
+			print
 		}
 		}
 		next
 		next
 	}
 	}
-	!inToc { print >> toFile }
-	END { if (!seenToc) { close(toFile); printf "" > toFile } }
-' "$markdown"
+	!inToc { print }
+' "$markdown" > "$markdown.bak"
 
 
-if [ -s "$toFile" ]; then
-	mv "$toFile" "$markdown"
-else
-	rm "$toFile"
-	echo "$toc"
-fi
+mv -f "$markdown.bak" "$markdown"
+rm -f "$markdown.toc"