Implement a generic links & text extractor

Before this change, links were only scraped from paragraphs and rendered as a block after the parent paragraph. This commit replaces that logic with a generic links extractor that recursively collects every link from any parent node, including footnotes, blockquotes, and lists. The renderer splits the links block into three separate blocks: the first containing footnotes, the second containing images, and the third containing links. Blocks are separated by a single line break. This also makes the renderer skip rendering links-only lists, making them behave like links-only paragraphs (which are already rendered as a series of consecutive links). The old behavior of extracting links from links-only paragraphs (where the paragraph itself is treated as just a set of links) remains and can still be used. Text extraction is also now unified across headings, paragraphs, table headings, blockquotes, and other elements where it was used. This fixes a number of bugs (like panicking on links inside headings) and shortcomings where the renderer would render text differently depending on the container element type. Fixes #17 and #23.
tdemin · Sep 3, 2021 · bcc3d8b · bcc3d8b
1 parent 3a86f0f
commit bcc3d8b
Showing 1 changed file with 157 additions and 102 deletions.
diff --git a/internal/gemini/renderer.go b/internal/gemini/renderer.go
@@ -49,7 +49,8 @@ var (
 	supClose           = []byte(")")
 )
 
-var meaningfulCharsRegex = regexp.MustCompile(`\A[\s]+\z`)
+// matches a FULL string that contains no non-whitespace characters
+var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)
 
 const timestampFormat = "2006-01-02 15:04"
 
@@ -178,88 +179,142 @@ func (r Renderer) heading(w io.Writer, node *ast.Heading, entering bool) {
 			heading[gemtextHeadingLevelLimit] = ' '
 		}
 		w.Write(heading)
-		for _, text := range node.Children {
-			w.Write(text.AsLeaf().Literal)
-		}
+		r.text(w, node)
 	} else {
 		w.Write(lineBreak)
 	}
 }
 
+func extractLinks(node ast.Node) (stack []ast.Node) {
+	if node := node.AsContainer(); node != nil {
+		for _, subnode := range node.Children {
+			stack = append(stack, extractLinks(subnode)...)
+		}
+	}
+	switch node := node.(type) {
+	case *ast.Image:
+		stack = append(stack, node)
+	case *ast.Link:
+		stack = append(stack, node)
+		// footnotes are represented as links which embed an extra node
+		// containing footnote text; the link itself is not considered a
+		// container
+		if node.Footnote != nil {
+			stack = append(stack, extractLinks(node.Footnote)...)
+		}
+	}
+	return stack
+}
+
+func (r Renderer) renderLinks(w io.Writer, links []ast.Node) (count uint) {
+	for _, link := range links {
+		if link, ok := link.(*ast.Link); ok && link.Footnote == nil {
+			r.link(w, link, true)
+			w.Write(lineBreak)
+			count++
+		}
+	}
+	return
+}
+
+func (r Renderer) renderFootnotes(w io.Writer, links []ast.Node) (count uint) {
+	for _, link := range links {
+		if link, ok := link.(*ast.Link); ok && link.Footnote != nil {
+			r.link(w, link, true)
+			w.Write(lineBreak)
+			count++
+		}
+	}
+	return
+}
+
+func (r Renderer) renderImages(w io.Writer, links []ast.Node) (count uint) {
+	for _, link := range links {
+		if link, ok := link.(*ast.Image); ok {
+			r.image(w, link, true)
+			w.Write(lineBreak)
+			count++
+		}
+	}
+	return
+}
+
+func (r Renderer) linksList(w io.Writer, links []ast.Node) {
+	for _, renderer := range []func(Renderer, io.Writer, []ast.Node) uint{
+		Renderer.renderFootnotes,
+		Renderer.renderImages,
+		Renderer.renderLinks,
+	} {
+		linksRendered := renderer(r, w, links)
+		// ensure breaks between link blocks of the same type
+		if linksRendered > 0 {
+			w.Write(lineBreak)
+		}
+	}
+}
+
+func isLinksOnlyParagraph(node *ast.Paragraph) bool {
+	for _, child := range node.Children {
+		switch child := child.(type) {
+		case *ast.Text:
+			if emptyLineRegex.Find(child.Literal) != nil {
+				continue
+			}
+		case *ast.Link, *ast.Image:
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+func isLinksOnlyList(node *ast.List) bool {
+	for _, child := range node.Children {
+		child, ok := child.(*ast.ListItem)
+		if !ok {
+			return false // should never happen
+		}
+		for _, liChild := range child.Children {
+			liChild, ok := liChild.(*ast.Paragraph)
+			if !ok {
+				return false // sublist, etc
+			}
+			if !isLinksOnlyParagraph(liChild) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
 func (r Renderer) paragraph(w io.Writer, node *ast.Paragraph, entering bool) (noNewLine bool) {
+	linksOnly := isLinksOnlyParagraph(node)
+	noNewLine = linksOnly
 	if entering {
 		children := node.Children
-		linkStack := make([]ast.Node, 0, len(children))
 		// current version of gomarkdown/markdown finds an empty
 		// *ast.Text element before links/images, breaking the heuristic
 		if len(children) >= 2 {
-			firstChild := children[0]
-			_, elementIsText := firstChild.(*ast.Text)
-			asLeaf := firstChild.AsLeaf()
-			if elementIsText && asLeaf != nil && len(asLeaf.Literal) == 0 {
+			firstChild, elementIsText := children[0].(*ast.Text)
+			if elementIsText && len(firstChild.Literal) == 0 {
 				children = children[1:]
 			}
 		}
-		linksOnly := func() bool {
+		if !linksOnly {
 			for _, child := range children {
+				// only render links text in the paragraph if they're
+				// combined with some other text on page
 				switch child := child.(type) {
-				case *ast.Link, *ast.Image:
-					continue
-				case *ast.Text:
-					// any meaningful text?
-					if meaningfulCharsRegex.Find(child.Literal) == nil {
-						return false
-					}
-					continue
-				}
-				return false
-			}
-			return true
-		}()
-		noNewLine = linksOnly
-		for _, child := range children {
-			// only render links text in the paragraph if they're
-			// combined with some other text on page
-			switch child := child.(type) {
-			case *ast.Link, *ast.Image:
-				if !linksOnly {
+				case *ast.Text, *ast.Code, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
 					r.text(w, child)
-				}
-				linkStack = append(linkStack, child)
-			case *ast.Text, *ast.Code, *ast.Emph, *ast.Strong, *ast.Del:
-				// the condition prevents text blocks consisting only of
-				// line breaks and spaces and such from rendering
-				if !linksOnly {
-					r.text(w, child)
-				}
-			case *ast.Subscript:
-				if !linksOnly {
+				case *ast.Subscript:
 					r.subscript(w, child, true)
-				}
-			case *ast.Superscript:
-				if !linksOnly {
+				case *ast.Superscript:
 					r.superscript(w, child, true)
 				}
 			}
-		}
-		if !linksOnly {
 			w.Write(lineBreak)
 		}
-		// render a links block after paragraph
-		if len(linkStack) > 0 {
-			if !linksOnly {
-				w.Write(lineBreak)
-			}
-			for _, link := range linkStack {
-				switch link := link.(type) {
-				case *ast.Link:
-					r.link(w, link, true)
-				case *ast.Image:
-					r.image(w, link, true)
-				}
-				w.Write(lineBreak)
-			}
-		}
 	}
 	return
 }
@@ -310,62 +365,49 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
 	}
 }
 
-func (r Renderer) text(w io.Writer, node ast.Node) {
+var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)
+
+func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
+	buf := bytes.Buffer{}
 	delimiter := getNodeDelimiter(node)
 	// special case for footnotes: we want them in the text
 	if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
-		fmt.Fprintf(w, "[^%d]", node.NoteID)
+		fmt.Fprintf(&buf, "[^%d]", node.NoteID)
 	}
 	if node := node.AsLeaf(); node != nil {
-		// replace all newlines in text with spaces, allowing for soft
-		// wrapping; this is recommended as per Gemini spec p. 5.4.1
-		w.Write(delimiter)
-		w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
-		w.Write(delimiter)
-		return
+		// replace all newlines in text with preferred symbols; this may
+		// be spaces for general text, allowing for soft wrapping, which
+		// is recommended as per Gemini spec p. 5.4.1, or line breaks
+		// with a blockquote symbols for blockquotes, or just nothing
+		buf.Write(delimiter)
+		buf.Write(lineBreakCharacters.ReplaceAll(node.Literal, replacement))
+		buf.Write(delimiter)
 	}
 	if node := node.AsContainer(); node != nil {
-		w.Write(delimiter)
+		buf.Write(delimiter)
 		for _, child := range node.Children {
-			r.text(w, child)
+			// skip non-text child elements from rendering
+			switch child := child.(type) {
+			case *ast.List:
+			default:
+				buf.Write(textWithNewlineReplacement(child, replacement))
+			}
 		}
-		w.Write(delimiter)
+		buf.Write(delimiter)
 	}
+	return buf.Bytes()
+}
+
+func (r Renderer) text(w io.Writer, node ast.Node) {
+	w.Write(textWithNewlineReplacement(node, space))
 }
 
-// TODO: this really should've been unified with text(), but having two
-// extra params for prefix/line breaks is not neat
 func (r Renderer) blockquoteText(w io.Writer, node ast.Node) {
-	delimiter := getNodeDelimiter(node)
-	if node := node.AsLeaf(); node != nil {
-		// pad every line break with blockquote symbol
-		w.Write(delimiter)
-		w.Write([]byte(bytes.ReplaceAll(node.Literal, lineBreak, quoteBrPrefix)))
-		w.Write(delimiter)
-		return
-	}
-	if node := node.AsContainer(); node != nil {
-		w.Write(delimiter)
-		for _, child := range node.Children {
-			r.blockquoteText(w, child)
-		}
-		w.Write(delimiter)
-	}
+	w.Write(textWithNewlineReplacement(node, quoteBrPrefix))
 }
 
 func extractText(node ast.Node) string {
-	delimiter := getNodeDelimiter(node)
-	if node := node.AsLeaf(); node != nil {
-		return string(delimiter) + strings.ReplaceAll(string(node.Literal), "\n", " ") + string(delimiter)
-	}
-	if node := node.AsContainer(); node != nil {
-		b := strings.Builder{}
-		for _, child := range node.Children {
-			b.WriteString(string(delimiter) + extractText(child) + string(delimiter))
-		}
-		return b.String()
-	}
-	panic("encountered a non-leaf & non-container node")
+	return string(textWithNewlineReplacement(node, space))
 }
 
 func (r Renderer) tableHead(t *tablewriter.Table, node *ast.TableHeader) {
@@ -435,9 +477,11 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
 	// container subroutines have to handle their subelements on
 	// themselves.
 	noNewLine := true
+	fetchLinks := false
 	switch node := node.(type) {
 	case *ast.BlockQuote:
 		r.blockquote(w, node, entering)
+		fetchLinks = true
 	case *ast.HorizontalRule:
 		r.hr(w, node, entering)
 	case *ast.Heading:
@@ -449,6 +493,7 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
 		case *ast.BlockQuote, *ast.ListItem, *ast.Footnotes:
 		default:
 			noNewLine = r.paragraph(w, node, entering)
+			fetchLinks = true
 		}
 	case *ast.CodeBlock:
 		r.code(w, node)
@@ -460,16 +505,26 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
 		_, parentIsDocument := node.Parent.(*ast.Document)
 		// footnotes are rendered as links after the parent paragraph
 		if !node.IsFootnotesList && parentIsDocument && !entering {
-			r.list(w, node, 0)
-			noNewLine = false
+			if !isLinksOnlyList(node) {
+				r.list(w, node, 0)
+				noNewLine = false
+			}
+			fetchLinks = true
 		}
 	case *ast.Table:
 		r.table(w, node, entering)
 		noNewLine = false
+		fetchLinks = true
 	}
 	if !noNewLine && !entering {
 		w.Write(lineBreak)
 	}
+	if fetchLinks && !entering {
+		links := extractLinks(node)
+		if len(links) > 0 {
+			r.linksList(w, links)
+		}
+	}
 	return ast.GoToNext
 }