7 月之前 · 190775c582
--- a/toc.sh
+++ b/toc.sh
@@ -4,71 +4,82 @@ set -Eeuo pipefail
 
															 self="$(basename "$0")"
														
 
															 usage() {
														
 
															 	cat <<-EOU
														
 
															-		usage: $self path/to/README.md
														
 
															+		usage: $self path/to/markdown.md
														
 
															 		   eg: $self README.md
														
 
															-		WARNING: if README.md has the TOC-replacement comments,
														
 
															-		         README.md.bak will be clobbered and the TOC will be inserted
														
 
															+		WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
														
 
															 	EOU
														
 
															 }
														
 
															 markdown="${1:-}"
														
 
															-if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
														
 
															+if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
														
 
															-toc="$(
														
 
															-	gawk '
														
 
															-		# ignore comments in code blocks, which are not headers but look like them
														
 
															-		/^```/ { ignore = !ignore }
														
 
															+# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
														
 
															-		/^#/ && !ignore {
														
 
															-			level = length($1)
														
 
															-			$1 = ""
														
 
															-			gsub(/^[[:space:]]|[[:space:]]$/, "")
														
 
															+jq --raw-input --null-input --raw-output '
														
 
															+	reduce inputs as $line ({ toc: "" };
														
 
															+		if $line | test("^```") then
														
 
															+			.ignore |= not
														
 
															+		else . end
														
 
															+		| if .ignore then . else
														
 
															+			(
														
 
															+				$line
														
 
															+				| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
														
 
															+				// null
														
 
															+			) as $cap
														
 
															+			| if $cap then
														
 
															+				($cap.hash | length) as $level
														
 
															+				| .levels[$level] += 1
														
 
															+				| .levels |= (.[range($level+1; length)] = 0)
														
 
															+				| (
														
 
															+					$cap.heading
														
 
															+					| ascii_downcase
														
 
															+					# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
														
 
															+					| gsub(" "; "-")
														
 
															+					# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
														
 
															+					#| gsub("%[abcdef0-9]{2}"; ""; "i")
														
 
															+					| gsub("%"; "")
														
 
															+					# single chars that are removed
														
 
															+					| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
														
 
															+					# CJK punctuations that are removed
														
 
															+					| gsub("[。？！，、；：“”【】（）〔〕［］﹃﹄“ ”‘’﹁﹂—…－～《》〈〉「」]"; "")
														
 
															+					# Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
														
 
															+					| (split("") | map(select(utf8bytelength == 1)) | join(""))
														
 
															+					# TODO Strip embedded markdown formatting
														
 
															+				) as $anchor
														
 
															+				# handle repetition (same end anchor)
														
 
															+				| (
														
 
															+					(.seen // []) as $seen
														
 
															+					| first(
														
 
															+						# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
														
 
															+						$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
														
 
															+						| select(IN($seen[]) | not)
														
 
															+					)
														
 
															+					// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
														
 
															+				) as $finalAnchor
														
 
															+				| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
														
 
															+				| .seen += [ $finalAnchor ]
														
 
															+			else . end
														
 
															+		end
														
 
															+	)
														
 
															+	| .toc
														
 
															+' "$markdown" > "$markdown.toc"
														
 
															-			++levelCounter[level]
														
 
															-			for (i in levelCounter) {
														
 
															-				if (i > level) {
														
 
															-					levelCounter[i] = 0
														
 
															-				}
														
 
															-			}
														
 
															-			prefix = levelCounter[level] ".\t"
														
 
															-			for (i = 1; i < level; ++i) {
														
 
															-				prefix = "\t" prefix
														
 
															-			}
														
 
															-
														
 
															-			# https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
														
 
															-			hash = tolower($0)
														
 
															-			gsub(/ /, "-", hash)
														
 
															-			gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
														
 
															-			gsub(/[。？！，、；：“”【】（）〔〕［］﹃﹄“ ”‘’﹁﹂—…－～《》〈〉「」]/, "", hash)
														
 
															-
														
 
															-			printf "%s[%s](#%s)\n", prefix, $0, hash
														
 
															-		}
														
 
															-	' "$markdown"
														
 
															-)"
														
 
															-
														
 
															-toFile="${markdown}.bak"
														
 
															-gawk -v toFile="$toFile" -v toc="$toc" '
														
 
															-	BEGIN { printf "" > toFile }
														
 
															+gawk -v tocFile="$markdown.toc" '
														
 
															 	/^<!-- AUTOGENERATED TOC -->$/ {
														
 
															 		inToc = !inToc
														
 
															 		seenToc = 1
														
 
															 		if (inToc) {
														
 
															-			print >> toFile
														
 
															-			print "" >> toFile
														
 
															-			print toc >> toFile
														
 
															-			print "" >> toFile
														
 
															-			print >> toFile
														
 
															+			print
														
 
															+			print ""
														
 
															+			system("cat " tocFile)
														
 
															+			# no need for another newline because tocFile should already end with one
														
 
															+			print
														
 
															 		}
														
 
															 		next
														
 
															 	}
														
 
															-	!inToc { print >> toFile }
														
 
															-	END { if (!seenToc) { close(toFile); printf "" > toFile } }
														
 
															-' "$markdown"
														
 
															+	!inToc { print }
														
 
															+' "$markdown" > "$markdown.bak"
														
 
															-if [ -s "$toFile" ]; then
														
 
															-	mv "$toFile" "$markdown"
														
 
															-else
														
 
															-	rm "$toFile"
														
 
															-	echo "$toc"
														
 
															-fi
														
 
															+mv -f "$markdown.bak" "$markdown"
														
 
															+rm -f "$markdown.toc"