From ddb3943359ce0b1a7a497f8de8112ccfdbc3ccb4 Mon Sep 17 00:00:00 2001
From: mntn <85877297+mntn-xyz@users.noreply.github.com>
Date: Sat, 2 Oct 2021 06:07:46 -0400
Subject: [PATCH] Strip HTML tags but keep content while rendering

This makes the renderer print the content of informational
HTML tags while stripping the tags themselves.

Tags like script, iframe, style, etc., which are unlikely to
ever hold presentable content, are the exception: their
content is dropped from the output along with the tags
themselves.

<br>, a hard-break tag, is supported as a Markdown
hard-break replacement (the two spaces before newline).

This also adds tests for this behavior inside general_text.md.

Fixes #6, a longstanding issue with inline HTML in
blockquotes.
---
 go.mod                        |   1 +
 go.sum                        |   2 +
 internal/renderer/renderer.go | 117 ++++++++++++++++++++++++++++------
 testdata/general_text.gmi     |  49 ++++++++++++--
 testdata/general_text.md      |  73 +++++++++++++++++++--
 5 files changed, 213 insertions(+), 29 deletions(-)
diff --git a/go.mod b/go.mod
index 89b7e44..6b9ce25 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/Masterminds/sprig/v3 v3.2.2
 	github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a
 	github.com/google/uuid v1.3.0 // indirect
+	github.com/grokify/html-strip-tags-go v0.0.1
 	github.com/hexops/gotextdiff v1.0.3
 	github.com/huandu/xstrings v1.3.2 // indirect
 	github.com/imdario/mergo v0.3.12 // indirect
diff --git a/go.sum b/go.sum
index d3ede79..894b2f0 100644
--- a/go.sum
+++ b/go.sum
@@ -21,6 +21,8 @@ github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a/go.mod h1:JDGc
 github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
+github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
 github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
 github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
 github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
diff --git a/internal/renderer/renderer.go b/internal/renderer/renderer.go
index 9f35bf7..c864976 100644
--- a/internal/renderer/renderer.go
+++ b/internal/renderer/renderer.go
@@ -20,11 +20,12 @@ package renderer
 import (
 	"bytes"
 	"fmt"
+	"html"
 	"io"
 	"regexp"
-	"strings"
 
 	"github.com/gomarkdown/markdown/ast"
+	"github.com/grokify/html-strip-tags-go"
 	"github.com/olekukonko/tablewriter"
 )
 
@@ -51,6 +52,25 @@ var (
 // matches a FULL string that contains no non-whitespace characters
 var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)
 
+// fairly tolerant to handle weird HTML; (?is) matches tag names in any case and lets element bodies span lines
+var tagPairRegexString = `(?is)<[\n\f ]*%s([\n\f ]+[^\n\f \/>"'=]+[\n\f ]*(=[\n\f ]*([a-zA-Z0-9\-]+|"[^\n\f"]+"|'[^\n\f']+'))?)*[\n\f ]*>.*?<[\n\f ]*/[\n\f ]*%s[\n\f ]*>`
+
+// htmlNoRenderRegex lists patterns for HTML elements that are removed from HTML blocks together with their contents
+var htmlNoRenderRegex = []*regexp.Regexp{
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "fieldset", "fieldset")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "form", "form")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "iframe", "iframe")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "script", "script")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "style", "style")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "canvas", "canvas")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "dialog", "dialog")),
+	regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "progress", "progress")),
+}
+
+var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)
+var hardBreakTag = regexp.MustCompile(`(?i)< *br */? *>`)
+var escapedHtmlChar = regexp.MustCompile(`(?:^|[^\\])&#?[[:alnum:]]+;`)
+
 // Renderer implements markdown.Renderer.
 type Renderer struct{}
 
@@ -82,7 +102,7 @@ func (r Renderer) link(w io.Writer, node *ast.Link, entering bool) {
 			w.Write(linkPrefix)
 			w.Write(node.Destination)
 			w.Write(space)
-			r.text(w, node)
+			r.text(w, node, true)
 		}
 	}
 }
@@ -92,7 +112,7 @@ func (r Renderer) image(w io.Writer, node *ast.Image, entering bool) {
 		w.Write(linkPrefix)
 		w.Write(node.Destination)
 		w.Write(space)
-		r.text(w, node)
+		r.text(w, node, true)
 	}
 }
 
@@ -125,7 +145,7 @@ func (r Renderer) subscript(w io.Writer, node *ast.Subscript, entering bool) {
 	if entering {
 		if node := node.AsLeaf(); node != nil {
 			w.Write(subOpen)
-			w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
+			w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
 			w.Write(subClose)
 		}
 	}
@@ -134,7 +154,7 @@ func (r Renderer) superscript(w io.Writer, node *ast.Superscript, entering bool)
 	if entering {
 		if node := node.AsLeaf(); node != nil {
 			w.Write(supOpen)
-			w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
+			w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
 			w.Write(supClose)
 		}
 	}
@@ -151,7 +171,7 @@ func (r Renderer) heading(w io.Writer, node *ast.Heading, entering bool) {
 			heading[i] = '#'
 		}
 		w.Write(heading)
-		r.text(w, node)
+		r.text(w, node, true)
 	} else {
 		w.Write(lineBreak)
 	}
@@ -277,8 +297,16 @@ func (r Renderer) paragraph(w io.Writer, node *ast.Paragraph, entering bool) (no
 				// only render links text in the paragraph if they're
 				// combined with some other text on page
 				switch child := child.(type) {
-				case *ast.Text, *ast.Code, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
-					r.text(w, child)
+				case *ast.Text, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
+					r.text(w, child, true)
+				case *ast.Code:
+					r.text(w, child, false)
+				case *ast.Hardbreak:
+					w.Write(lineBreak)
+				case *ast.HTMLSpan:
+					if hardBreakTag.Match(child.AsLeaf().Literal) {
+						w.Write(lineBreak)
+					}
 				case *ast.Subscript:
 					r.subscript(w, child, true)
 				case *ast.Superscript:
@@ -326,7 +354,7 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
 			} else if !isTerm {
 				w.Write(itemPrefix)
 			}
-			r.text(w, item)
+			r.text(w, item, true)
 			w.Write(lineBreak)
 			if l >= 2 {
 				if list, ok := item.Children[1].(*ast.List); ok {
@@ -337,22 +365,43 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
 	}
 }
 
-var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)
-
-func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
+func textWithNewlineReplacement(node ast.Node, replacement []byte, unescapeHtml bool) []byte {
 	buf := bytes.Buffer{}
 	delimiter := getNodeDelimiter(node)
 	// special case for footnotes: we want them in the text
 	if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
 		fmt.Fprintf(&buf, "[^%d]", node.NoteID)
 	}
-	if node := node.AsLeaf(); node != nil {
+	if leaf := node.AsLeaf(); leaf != nil {
 		// replace all newlines in text with preferred symbols; this may
 		// be spaces for general text, allowing for soft wrapping, which
 		// is recommended as per Gemini spec p. 5.4.1, or line breaks
 		// with a blockquote symbols for blockquotes, or just nothing
 		buf.Write(delimiter)
-		buf.Write(lineBreakCharacters.ReplaceAll(node.Literal, replacement))
+		switch node := node.(type) {
+		case *ast.Hardbreak:
+			buf.Write(lineBreak)
+			// If the blockquote ends with a double space, the parser will
+			// not create a Hardbreak at the end, so this works.
+			if _, ok := leaf.Parent.(*ast.BlockQuote); !ok {
+				buf.Write(quotePrefix)
+			}
+		case *ast.HTMLSpan:
+			if hardBreakTag.Match(leaf.Literal) {
+				buf.Write(lineBreak)
+			}
+			buf.Write(leaf.Content)
+		case *ast.HTMLBlock:
+			buf.Write([]byte(extractHtml(node, quotePrefix)))
+		default:
+			// Unescape every matched entity individually; only the matched
+			// bytes are replaced, never the whole literal.
+			rendered := lineBreakCharacters.ReplaceAll(leaf.Literal, replacement)
+			if unescapeHtml {
+				rendered = escapedHtmlChar.ReplaceAllFunc(rendered, func(m []byte) []byte { return []byte(html.UnescapeString(string(m))) })
+			}
+			buf.Write(rendered)
+		}
 		buf.Write(delimiter)
 	}
 	if node := node.AsContainer(); node != nil {
@@ -362,7 +411,7 @@ func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
 			switch child := child.(type) {
 			case *ast.List:
 			default:
-				buf.Write(textWithNewlineReplacement(child, replacement))
+				buf.Write(textWithNewlineReplacement(child, replacement, unescapeHtml))
 			}
 		}
 		buf.Write(delimiter)
@@ -370,16 +419,30 @@ func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
 	return buf.Bytes()
 }
 
-func (r Renderer) text(w io.Writer, node ast.Node) {
-	w.Write(textWithNewlineReplacement(node, space))
+func (r Renderer) text(w io.Writer, node ast.Node, unescapeHtml bool) {
+	w.Write(textWithNewlineReplacement(node, space, unescapeHtml))
 }
 
 func (r Renderer) blockquoteText(w io.Writer, node ast.Node) {
-	w.Write(textWithNewlineReplacement(node, quoteBrPrefix))
+	w.Write(textWithNewlineReplacement(node, quoteBrPrefix, true))
 }
 
 func extractText(node ast.Node) string {
-	return string(textWithNewlineReplacement(node, space))
+	return string(textWithNewlineReplacement(node, space, true))
+}
+
+// extractHtml returns the text of an HTML block: non-renderable elements
+// are removed, newlines become spaces, <br> becomes a line break followed
+// by linePrefix, remaining tags are stripped and entities are unescaped.
+func extractHtml(node *ast.HTMLBlock, linePrefix []byte) string {
+	literal := node.Literal
+	for _, re := range htmlNoRenderRegex {
+		literal = re.ReplaceAllLiteral(literal, nil)
+	}
+	// Copy lineBreak first so append never writes into the shared slice.
+	brReplacement := append(append([]byte{}, lineBreak...), linePrefix...)
+	withBreaks := hardBreakTag.ReplaceAllLiteral(lineBreakCharacters.ReplaceAll(literal, space), brReplacement)
+	return html.UnescapeString(strip.StripTags(string(withBreaks)))
+}
 
 func (r Renderer) tableHead(t *tablewriter.Table, node *ast.TableHeader) {
@@ -440,6 +503,17 @@ func (r Renderer) table(w io.Writer, node *ast.Table, entering bool) {
 	}
 }
 
+// htmlBlock writes the tag-stripped text of an HTML block plus a blank line; empty results write nothing.
+func (r Renderer) htmlBlock(w io.Writer, node *ast.HTMLBlock, entering bool) {
+	if !entering {
+		return
+	}
+	if text := extractHtml(node, nil); len(text) > 0 {
+		w.Write([]byte(text))
+		w.Write(bytes.Repeat(lineBreak, 2))
+	}
+}
+
 // RenderNode implements Renderer.RenderNode().
 func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.WalkStatus {
 	// entering in gomarkdown was made to have elements of type switch
@@ -487,6 +561,11 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
 		r.table(w, node, entering)
 		noNewLine = false
 		fetchLinks = true
+	case *ast.HTMLBlock:
+		// Do not render if already rendered as part of a blockquote
+		if _, ok := node.Parent.(*ast.BlockQuote); !ok {
+			r.htmlBlock(w, node, entering)
+		}
 	}
 	if !noNewLine && !entering {
 		w.Write(lineBreak)
diff --git a/testdata/general_text.gmi b/testdata/general_text.gmi
index 326f879..d578655 100644
--- a/testdata/general_text.gmi
+++ b/testdata/general_text.gmi
@@ -6,6 +6,10 @@ Single newlines (like in this multi-line paragraph) will get replaced by a space
 
 Inline formatting bits (like this **bold** text, *emphasized* text, ~~strikethrough~~ text, `preformatted text`) are kept to make sure Gemini readers still have the stylistic context of your text.
 
+Adding two spaces at the end of a line will insert a hard
+break. You can also create a hard break using a backslash at the end
+of a line. Hard breaks at the end of a paragraph are ignored.
+
 ## Blockquotes
 
 Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with a space. This facilitates appending authorship information to the quote, or using blockquotes to write poems.
@@ -22,6 +26,9 @@ Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with
 
 > — also Timur Demin, in the process of writing this test file
 
+> Hard breaks are also supported in blockquotes,
+> for compatibility. Hard breaks at the end of a blockquote are ignored.
+
 ## Code
 
 gmnhg will use Gemtext preformatted blocks for that. Markdown alt-text for preformatted blocks is supported, and is used to render alt-text as specified by Gemini spec p. 5.4.3.
@@ -74,15 +81,45 @@ Since clients like Lagrange treat the fourth and the rest of #-s as heading cont
 
 ###### Heading 6
 
-## Misc
+## HTML
+
+Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future. HTML tags can be escaped with \ as in <span></span> or enclosed with ``.
+
+HTML tags are stripped from HTML blocks. (Note that HTML blocks must begin and end with a supported HTML block tag, and must have blank lines before and after the block.)
+
+### Break tags
+
+Hard breaks
+using <br> are supported.
+
+Hard breaks using <br> are supported
+inside HTML blocks.
+
+### HTML entities
 
-Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future.
+HTML escaped entities like & and < are unescaped, even when they show up inside an inline HTML section. Escaping them with a leading backslash is possible outside of HTML blocks: &amp;, &lt;. Any escaped characters inside a code span (such as `&lt; or &gt;`) will not be unescaped.
 
-> There's currently a bug in gmnhg which prevents it from
-> stripping HTML in certain scenarios. HTML is noticeably still present
-> inside <span>blockquotes</span>.
+HTML escaped entities like < and > are also unescaped inside HTML blocks. Backslash escapes have no effect: \&.
 
-=> https://github.com/tdemin/gmnhg/issues/6 bug in gmnhg
+### Forbidden tags
+
+Tags that are unable to output Gemini-compatible text are completely removed from the output.
+
+Note that the contents of "forbidden" tags will be rendered if they are placed inline, although the tags themselves will be stripped. Placing HTML block elements inline in this manner violates the spec of common Markdown flavors, but gmnhg handles it the best it can.
+
+### HTML in blockquotes
+
+> HTML spans are stripped from
+> inside blockquotes.
+
+> Non HTML block text before the block.
+> HTML blocks are stripped from inside blockquotes.
+> Non HTML block text after the block.
+
+> Standalone blockquoted HTML blocks
+> are also stripped of their tags.
+
+## Misc
 
 ---
 
diff --git a/testdata/general_text.md b/testdata/general_text.md
index f9af9b9..1f978ad 100644
--- a/testdata/general_text.md
+++ b/testdata/general_text.md
@@ -10,6 +10,10 @@ Inline formatting bits (like this **bold** text, _emphasized_ text,
 ~~strikethrough~~ text, `preformatted text`) are kept to make sure
 Gemini readers still have the stylistic context of your text.
 
+Adding two spaces at the end of a line will insert a hard  
+break. You can also create a hard break using a backslash at the end\
+of a line. Hard breaks at the end of a paragraph are ignored.  
+
 ## Blockquotes
 
 Newlines in blockquote paragraphs, unlike usual paragraphs, aren't
@@ -28,6 +32,9 @@ to the quote, or using blockquotes to write poems.
 >
 > — also Timur Demin, in the process of writing this test file
 
+> Hard breaks are also supported in blockquotes,  
+> for compatibility. Hard breaks at the end of a blockquote are ignored.  
+
 ## Code
 
 gmnhg will use Gemtext preformatted blocks for that. Markdown alt-text
@@ -86,15 +93,73 @@ your client handles that.
 
 ###### Heading 6
 
-## Misc
+## HTML
 
 Inline HTML is <span class="bold">currently</span> stripped, but HTML
-contents remain on-screen. This may change in the future.
+contents remain on-screen. This may change in the future. HTML tags
+can be escaped with \ as in \<span>\</span> or enclosed with \`\`.
+
+<p>HTML tags are stripped from HTML blocks. (Note that HTML blocks
+must begin and end with a <em>supported</em> HTML block tag, and must
+have blank lines before and after the block.)</p>
+
+### Break tags
+
+Hard breaks<br>using \<br> are supported. 
+
+<p>Hard breaks using &lt;br&gt; are supported<br>inside HTML blocks.</p>
+
+### HTML entities
+
+HTML escaped entities like &amp; and <span>&lt;</span> are unescaped,
+even when they show up inside an inline HTML section. Escaping
+them with a leading backslash is possible outside of HTML blocks:
+\&amp;, \&lt;. Any escaped characters inside a code span (such as `&lt;
+or &gt;`) will not be unescaped.
+
+<p>HTML escaped entities like &lt; and &gt; are also unescaped
+inside HTML blocks. Backslash escapes have no effect: \&amp;.</p>
+
+### Forbidden tags
+
+Tags that are unable to output Gemini-compatible text are completely
+removed from the output.
+
+<fieldset>Fieldset blocks are not rendered.</fieldset>
 
-> There's currently a [bug in gmnhg][bug] which prevents it from
-> stripping HTML in certain scenarios. HTML is noticeably still present
+<form>Form blocks are not rendered.</form>
+
+<iframe>Iframe blocks are not rendered.</iframe>
+
+<script>Script blocks are not rendered.</script>
+
+<style>Style blocks are not rendered.</style>
+
+<canvas>Canvas blocks are not rendered.</canvas>
+
+<dialog>Dialog blocks are not rendered.</dialog>
+
+<progress>Progress blocks are not rendered.</progress>
+
+Note that the contents of "forbidden" tags will be rendered if they are
+placed <script>inline</script>, although the tags themselves will be
+stripped. Placing HTML block elements inline in this manner violates
+the spec of common Markdown flavors, but gmnhg handles it the best it
+can.
+
+### HTML in blockquotes
+
+> HTML spans are <em>stripped</em> from
 > inside <span>blockquotes</span>.
 
+> Non HTML block text before the block.
+> <p>HTML blocks are stripped from inside blockquotes.</p>
+> Non HTML block text after the block.
+
+> <p>Standalone blockquoted HTML blocks<br>are also stripped of their tags.</p>
+
+## Misc
+
 ***
 
 The Markdown horizontal line above is rendered as triple dashes.