Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip HTML tags (but keep any text content) when rendering text #33

Merged
merged 16 commits into from Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -7,6 +7,7 @@ require (
github.com/Masterminds/sprig/v3 v3.2.2
github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a
github.com/google/uuid v1.3.0 // indirect
github.com/grokify/html-strip-tags-go v0.0.1
github.com/hexops/gotextdiff v1.0.3
github.com/huandu/xstrings v1.3.2 // indirect
github.com/imdario/mergo v0.3.12 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Expand Up @@ -21,6 +21,8 @@ github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a/go.mod h1:JDGc
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
Expand Down
117 changes: 98 additions & 19 deletions internal/renderer/renderer.go
Expand Up @@ -20,11 +20,12 @@ package renderer
import (
"bytes"
"fmt"
"html"
"io"
"regexp"
"strings"

"github.com/gomarkdown/markdown/ast"
"github.com/grokify/html-strip-tags-go"
"github.com/olekukonko/tablewriter"
)

Expand All @@ -51,6 +52,25 @@ var (
// emptyLineRegex matches a FULL string that contains no non-whitespace
// characters (the empty string included).
var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)

// tagPairRegexString is a fmt template for a pattern matching one HTML
// element: opening tag, contents and closing tag. Both %s verbs take the tag
// name. It is fairly tolerant to handle weird HTML: whitespace is allowed
// around the tag name, and attributes may be bare, unquoted, single-quoted or
// double-quoted.
//
// The i flag is set because HTML tag names are case-insensitive. The s flag
// lets "." match newlines so that an element spanning several lines is
// matched as a whole. Unquoted attribute values may contain any digit,
// including 0.
var tagPairRegexString = `(?is)<[\n\f ]*%s([\n\f ]+[^\n\f \/>"'=]+[\n\f ]*(=[\n\f ]*([a-zA-Z0-9\-]+|"[^\n\f"]+"|'[^\n\f']+'))?)*[\n\f ]*>.*?<[\n\f ]*/[\n\f ]*%s[\n\f ]*>`
tdemin marked this conversation as resolved.
Show resolved Hide resolved

// HTML block tags whose contents should not be rendered
var htmlNoRenderRegex = []*regexp.Regexp{
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "fieldset", "fieldset")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "form", "form")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "iframe", "iframe")),
tdemin marked this conversation as resolved.
Show resolved Hide resolved
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "script", "script")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "style", "style")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "canvas", "canvas")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "dialog", "dialog")),
regexp.MustCompile(fmt.Sprintf(tagPairRegexString, "progress", "progress")),
}

var (
	// lineBreakCharacters matches a run of one or more CR/LF characters.
	lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)

	// hardBreakTag matches an HTML break tag such as <br>, <br/> or <br />,
	// with optional spaces around the name and the slash. Matching is
	// case-insensitive because HTML tag names are, so <BR> counts as well.
	hardBreakTag = regexp.MustCompile(`(?i)< *br */? *>`)

	// escapedHtmlChar matches an alphanumeric character reference such as
	// &amp; that sits at the start of the text or follows any character
	// other than a backslash. The preceding character, when there is one,
	// is part of the match. References containing '#' (e.g. &#39;) are not
	// matched. The doubled backslash inside the class is redundant but
	// harmless.
	escapedHtmlChar = regexp.MustCompile(`(?:^|[^\\\\])&[[:alnum:]]+;`)
)

// Renderer implements markdown.Renderer.
type Renderer struct{}

Expand Down Expand Up @@ -82,7 +102,7 @@ func (r Renderer) link(w io.Writer, node *ast.Link, entering bool) {
w.Write(linkPrefix)
w.Write(node.Destination)
w.Write(space)
r.text(w, node)
r.text(w, node, true)
}
}
}
Expand All @@ -92,7 +112,7 @@ func (r Renderer) image(w io.Writer, node *ast.Image, entering bool) {
w.Write(linkPrefix)
w.Write(node.Destination)
w.Write(space)
r.text(w, node)
r.text(w, node, true)
}
}

Expand Down Expand Up @@ -125,7 +145,7 @@ func (r Renderer) subscript(w io.Writer, node *ast.Subscript, entering bool) {
if entering {
if node := node.AsLeaf(); node != nil {
w.Write(subOpen)
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
w.Write(subClose)
}
}
Expand All @@ -134,7 +154,7 @@ func (r Renderer) superscript(w io.Writer, node *ast.Superscript, entering bool)
if entering {
if node := node.AsLeaf(); node != nil {
w.Write(supOpen)
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
w.Write(supClose)
}
}
Expand All @@ -151,7 +171,7 @@ func (r Renderer) heading(w io.Writer, node *ast.Heading, entering bool) {
heading[i] = '#'
}
w.Write(heading)
r.text(w, node)
r.text(w, node, true)
} else {
w.Write(lineBreak)
}
Expand Down Expand Up @@ -277,8 +297,16 @@ func (r Renderer) paragraph(w io.Writer, node *ast.Paragraph, entering bool) (no
// only render links text in the paragraph if they're
// combined with some other text on page
switch child := child.(type) {
case *ast.Text, *ast.Code, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
r.text(w, child)
case *ast.Text, *ast.Emph, *ast.Strong, *ast.Del, *ast.Link, *ast.Image:
r.text(w, child, true)
case *ast.Code:
r.text(w, child, false)
case *ast.Hardbreak:
w.Write(lineBreak)
case *ast.HTMLSpan:
if hardBreakTag.Match(child.AsLeaf().Literal) {
w.Write(lineBreak)
}
case *ast.Subscript:
r.subscript(w, child, true)
case *ast.Superscript:
Expand Down Expand Up @@ -326,7 +354,7 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
} else if !isTerm {
w.Write(itemPrefix)
}
r.text(w, item)
r.text(w, item, true)
w.Write(lineBreak)
if l >= 2 {
if list, ok := item.Children[1].(*ast.List); ok {
Expand All @@ -337,22 +365,43 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
}
}

var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)

func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
func textWithNewlineReplacement(node ast.Node, replacement []byte, unescapeHtml bool) []byte {
buf := bytes.Buffer{}
delimiter := getNodeDelimiter(node)
// special case for footnotes: we want them in the text
if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
fmt.Fprintf(&buf, "[^%d]", node.NoteID)
}
if node := node.AsLeaf(); node != nil {
if leaf := node.AsLeaf(); leaf != nil {
// replace all newlines in text with preferred symbols; this may
// be spaces for general text, allowing for soft wrapping, which
// is recommended as per Gemini spec p. 5.4.1, or line breaks
// with blockquote symbols for blockquotes, or just nothing
buf.Write(delimiter)
buf.Write(lineBreakCharacters.ReplaceAll(node.Literal, replacement))
switch node := node.(type) {
case *ast.Hardbreak:
tdemin marked this conversation as resolved.
Show resolved Hide resolved
buf.Write(lineBreak)
// If the blockquote ends with a double space, the parser will
// not create a Hardbreak at the end, so this works.
if _, ok := leaf.Parent.(*ast.BlockQuote); !ok {
buf.Write(quotePrefix)
}
case *ast.HTMLSpan:
if hardBreakTag.Match(leaf.Literal) {
buf.Write(lineBreak)
}
buf.Write(leaf.Content)
case *ast.HTMLBlock:
buf.Write([]byte(extractHtml(node, quotePrefix)))
default:
textWithoutBreaks := lineBreakCharacters.ReplaceAll(leaf.Literal, replacement)
if unescapeHtml {
unescapedText := escapedHtmlChar.ReplaceAll(textWithoutBreaks, []byte(html.UnescapeString(string(textWithoutBreaks))))
buf.Write(unescapedText)
} else {
buf.Write(textWithoutBreaks)
}
}
buf.Write(delimiter)
}
if node := node.AsContainer(); node != nil {
Expand All @@ -362,24 +411,38 @@ func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
switch child := child.(type) {
case *ast.List:
default:
buf.Write(textWithNewlineReplacement(child, replacement))
buf.Write(textWithNewlineReplacement(child, replacement, unescapeHtml))
}
}
buf.Write(delimiter)
}
return buf.Bytes()
}

func (r Renderer) text(w io.Writer, node ast.Node) {
w.Write(textWithNewlineReplacement(node, space))
func (r Renderer) text(w io.Writer, node ast.Node, unescapeHtml bool) {
w.Write(textWithNewlineReplacement(node, space, unescapeHtml))
}

func (r Renderer) blockquoteText(w io.Writer, node ast.Node) {
w.Write(textWithNewlineReplacement(node, quoteBrPrefix))
w.Write(textWithNewlineReplacement(node, quoteBrPrefix, true))
}

func extractText(node ast.Node) string {
return string(textWithNewlineReplacement(node, space))
return string(textWithNewlineReplacement(node, space, true))
}

// extractHtml converts the literal of an HTML block into plain text.
//
// Every element matched by one of the htmlNoRenderRegex patterns is removed
// together with its contents. If nothing is left, the empty string is
// returned. Otherwise each run of line-break characters becomes a single
// space, each <br> tag becomes a line break followed by linePrefix, the
// remaining tags are stripped, and HTML entities are unescaped.
func extractHtml(node *ast.HTMLBlock, linePrefix []byte) string {
	// Only render contents of allowed tags.
	literal := node.Literal
	for _, re := range htmlNoRenderRegex {
		literal = re.ReplaceAllLiteral(literal, nil)
	}
	if len(literal) == 0 {
		return ""
	}

	// Build the <br> replacement in a fresh slice: appending directly onto
	// the shared lineBreak slice could write into its spare capacity.
	brReplacement := make([]byte, 0, len(lineBreak)+len(linePrefix))
	brReplacement = append(brReplacement, lineBreak...)
	brReplacement = append(brReplacement, linePrefix...)

	flattened := lineBreakCharacters.ReplaceAll(literal, space)
	// ReplaceAllLiteral keeps a "$" in linePrefix from being interpreted as
	// a capture-group reference.
	withBreaks := hardBreakTag.ReplaceAllLiteral(flattened, brReplacement)
	return html.UnescapeString(strip.StripTags(string(withBreaks)))
}

func (r Renderer) tableHead(t *tablewriter.Table, node *ast.TableHeader) {
Expand Down Expand Up @@ -440,6 +503,17 @@ func (r Renderer) table(w io.Writer, node *ast.Table, entering bool) {
}
}

// htmlBlock writes the text extracted from an HTML block followed by two
// line breaks. Nothing is written when the node is being left or when the
// extracted text is empty.
func (r Renderer) htmlBlock(w io.Writer, node *ast.HTMLBlock, entering bool) {
	if !entering {
		return
	}
	rendered := extractHtml(node, []byte{})
	if rendered == "" {
		return
	}
	w.Write([]byte(rendered))
	// Separate the block from whatever follows with a blank line.
	for i := 0; i < 2; i++ {
		w.Write(lineBreak)
	}
}

// RenderNode implements Renderer.RenderNode().
func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.WalkStatus {
// entering in gomarkdown was made to have elements of type switch
Expand Down Expand Up @@ -487,6 +561,11 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
r.table(w, node, entering)
noNewLine = false
fetchLinks = true
case *ast.HTMLBlock:
mntn-xyz marked this conversation as resolved.
Show resolved Hide resolved
// Do not render if already rendered as part of a blockquote
if _, ok := node.Parent.(*ast.BlockQuote); !ok {
r.htmlBlock(w, node, entering)
}
}
if !noNewLine && !entering {
w.Write(lineBreak)
Expand Down
49 changes: 43 additions & 6 deletions testdata/general_text.gmi
Expand Up @@ -6,6 +6,10 @@ Single newlines (like in this multi-line paragraph) will get replaced by a space

Inline formatting bits (like this **bold** text, *emphasized* text, ~~strikethrough~~ text, `preformatted text`) are kept to make sure Gemini readers still have the stylistic context of your text.

Adding two spaces at the end of a line will insert a hard
break. You can also create a hard break using a backslash at the end
of a line. Hard breaks at the end of a paragraph are ignored.

## Blockquotes

Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with a space. This facilitates appending authorship information to the quote, or using blockquotes to write poems.
Expand All @@ -22,6 +26,9 @@ Newlines in blockquote paragraphs, unlike usual paragraphs, aren't replaced with

> — also Timur Demin, in the process of writing this test file

> Hard breaks are also supported in blockquotes,
> for compatibility. Hard breaks at the end of a blockquote are ignored.

## Code

gmnhg will use Gemtext preformatted blocks for that. Markdown alt-text for preformatted blocks is supported, and is used to render alt-text as specified by Gemini spec p. 5.4.3.
Expand Down Expand Up @@ -74,15 +81,45 @@ Since clients like Lagrange treat the fourth and the rest of #-s as heading cont

###### Heading 6

## Misc
## HTML

Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future. HTML tags can be escaped with \ as in <span></span> or enclosed with ``.

HTML tags are stripped from HTML blocks. (Note that HTML blocks must begin and end with a supported HTML block tag, and must have blank lines before and after the block.)

### Break tags

Hard breaks
using <br> are supported.

Hard breaks using <br> are supported
inside HTML blocks.

### HTML entities

Inline HTML is currently stripped, but HTML contents remain on-screen. This may change in the future.
HTML escaped entities like & and < are unescaped, even when they show up inside an inline HTML section. Escaping them with a leading backslash is possible outside of HTML blocks: &amp;, &lt;. Any escaped characters inside a code span (such as `&lt; or &gt;`) will not be unescaped.

> There's currently a bug in gmnhg which prevents it from
> stripping HTML in certain scenarios. HTML is noticeably still present
> inside <span>blockquotes</span>.
HTML escaped entities like < and > are also unescaped inside HTML blocks. Backslash escapes have no effect: \&.

=> https://github.com/tdemin/gmnhg/issues/6 bug in gmnhg
### Forbidden tags

Tags that are unable to output Gemini-compatible text are completely removed from the output.

Note that the contents of "forbidden" tags will be rendered if they are placed inline, although the tags themselves will be stripped. Placing HTML block elements inline in this manner violates the spec of common Markdown flavors, but gmnhg handles it the best it can.

### HTML in blockquotes

> HTML spans are stripped from
> inside blockquotes.

> Non HTML block text before the block.
> HTML blocks are stripped from inside blockquotes.
> Non HTML block text after the block.

> Standalone blockquoted HTML blocks
> are also stripped of their tags.

## Misc

---

Expand Down