Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip HTML tags (but keep any text content) when rendering text #33

Merged
merged 16 commits into from Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -7,6 +7,7 @@ require (
github.com/Masterminds/sprig/v3 v3.2.2
github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a
github.com/google/uuid v1.3.0 // indirect
github.com/grokify/html-strip-tags-go v0.0.1
github.com/huandu/xstrings v1.3.2 // indirect
github.com/imdario/mergo v0.3.12 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Expand Up @@ -21,6 +21,8 @@ github.com/gomarkdown/markdown v0.0.0-20210915032930-fe0e174ee09a/go.mod h1:JDGc
github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/grokify/html-strip-tags-go v0.0.1 h1:0fThFwLbW7P/kOiTBs03FsJSV9RM2M/Q/MOnCQxKMo0=
github.com/grokify/html-strip-tags-go v0.0.1/go.mod h1:2Su6romC5/1VXOQMaWL2yb618ARB8iVo6/DR99A6d78=
github.com/huandu/xstrings v1.3.1/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/huandu/xstrings v1.3.2 h1:L18LIDzqlW6xN2rEkpdV8+oL/IXWJ1APd+vsdYy4Wdw=
github.com/huandu/xstrings v1.3.2/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
Expand Down
51 changes: 46 additions & 5 deletions internal/renderer/renderer.go
Expand Up @@ -22,9 +22,9 @@ import (
"fmt"
"io"
"regexp"
"strings"

"github.com/gomarkdown/markdown/ast"
"github.com/grokify/html-strip-tags-go"
"github.com/olekukonko/tablewriter"
)

Expand All @@ -51,6 +51,21 @@ var (
// emptyLineRegex matches a FULL string that contains no non-whitespace
// characters, i.e. a string that is either empty or made up solely of
// whitespace. \A and \z anchor the match to the very beginning and the very
// end of the input, so a blank line inside a longer text does not match.
var emptyLineRegex = regexp.MustCompile(`\A[\s]*\z`)

// tagPairRegexString is a fmt template for a regular expression that matches
// a whole HTML element: the opening tag (with optional attributes), its
// contents, and the closing tag. Both %s verbs receive the tag name. The
// pattern is fairly tolerant of stray whitespace inside tags in order to
// handle weird HTML.
//
// Flags: "i" makes the tag name case-insensitive (<SCRIPT> and <script> are
// the same element); "s" lets "." match line breaks, so elements spanning
// several lines are matched as well. Unquoted attribute values may contain
// any ASCII digit, and quoted attribute values may be empty (attr="").
var tagPairRegexString = `(?is)<[\n\f ]*%s([\n\f ]+[^\n\f \/>"'=]+[\n\f ]*(=[\n\f ]*([a-zA-Z0-9\-]+|"[^\n\f"]*"|'[^\n\f']*'))?)*[\n\f ]*>.*?<[\n\f ]*/[\n\f ]*%s[\n\f ]*>`

// htmlNoRenderRegex holds one compiled pattern per HTML block tag whose
// contents should not be rendered. Each pattern is tagPairRegexString
// instantiated with the tag name for both the opening and the closing tag.
var htmlNoRenderRegex = func() []*regexp.Regexp {
	tags := []string{
		"fieldset",
		"form",
		"iframe",
		"script",
		"style",
		"canvas",
		"dialog",
		"progress",
	}
	compiled := make([]*regexp.Regexp, 0, len(tags))
	for _, tag := range tags {
		compiled = append(compiled, regexp.MustCompile(fmt.Sprintf(tagPairRegexString, tag, tag)))
	}
	return compiled
}()

// Renderer implements markdown.Renderer.
type Renderer struct{}

Expand Down Expand Up @@ -125,7 +140,7 @@ func (r Renderer) subscript(w io.Writer, node *ast.Subscript, entering bool) {
if entering {
if node := node.AsLeaf(); node != nil {
w.Write(subOpen)
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
w.Write(subClose)
}
}
Expand All @@ -134,7 +149,7 @@ func (r Renderer) superscript(w io.Writer, node *ast.Superscript, entering bool)
if entering {
if node := node.AsLeaf(); node != nil {
w.Write(supOpen)
w.Write([]byte(strings.ReplaceAll(string(node.Literal), "\n", " ")))
w.Write(bytes.ReplaceAll(node.Literal, lineBreak, space))
w.Write(supClose)
}
}
Expand Down Expand Up @@ -338,6 +353,7 @@ func (r Renderer) list(w io.Writer, node *ast.List, level int) {
}

// lineBreakCharacters matches a run of one or more consecutive line feed
// and/or carriage return characters.
var lineBreakCharacters = regexp.MustCompile(`[\n\r]+`)

// hardBreakTag matches an HTML <br> tag, with or without a self-closing
// slash and with optional spaces around the name and the slash. The match is
// case-insensitive because HTML tag names are, so <BR> is recognised too.
var hardBreakTag = regexp.MustCompile(`(?i)< *br */? *>`)

func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
buf := bytes.Buffer{}
Expand All @@ -346,13 +362,20 @@ func textWithNewlineReplacement(node ast.Node, replacement []byte) []byte {
if node, ok := node.(*ast.Link); ok && node.Footnote != nil {
fmt.Fprintf(&buf, "[^%d]", node.NoteID)
}
if node := node.AsLeaf(); node != nil {
if leaf := node.AsLeaf(); leaf != nil {
// replace all newlines in text with preferred symbols; this may
// be spaces for general text, allowing for soft wrapping, which
// is recommended as per Gemini spec p. 5.4.1, or line breaks
// with blockquote symbols for blockquotes, or just nothing
buf.Write(delimiter)
buf.Write(lineBreakCharacters.ReplaceAll(node.Literal, replacement))
switch node.(type) {
case *ast.Hardbreak:
tdemin marked this conversation as resolved.
Show resolved Hide resolved
buf.Write(lineBreak)
case *ast.HTMLSpan:
buf.Write(leaf.Content)
default:
buf.Write(lineBreakCharacters.ReplaceAll(leaf.Literal, replacement))
}
buf.Write(delimiter)
}
if node := node.AsContainer(); node != nil {
Expand Down Expand Up @@ -440,6 +463,22 @@ func (r Renderer) table(w io.Writer, node *ast.Table, entering bool) {
}
}

// htmlBlock writes the text content of an HTML block node to w. Elements
// matched by htmlNoRenderRegex are removed together with their contents,
// source line breaks are replaced with spaces, <br> tags become line breaks,
// and all remaining markup is stripped. Nothing is written when leaving the
// node or when no content is left after removing the non-rendered elements.
func (r Renderer) htmlBlock(w io.Writer, node *ast.HTMLBlock, entering bool) {
	if !entering {
		return
	}
	// Only render contents of allowed tags: cut every disallowed element.
	content := node.Literal
	for _, noRender := range htmlNoRenderRegex {
		content = noRender.ReplaceAllLiteral(content, []byte(""))
	}
	if len(content) == 0 {
		return
	}
	// Source line breaks are collapsed first so that the only line breaks
	// left in the output are the ones produced from <br> tags.
	content = lineBreakCharacters.ReplaceAll(content, space)
	content = hardBreakTag.ReplaceAll(content, lineBreak)
	plain := strip.StripTags(string(content))
	w.Write([]byte(plain))
	// Two line breaks follow the rendered text.
	w.Write(lineBreak)
	w.Write(lineBreak)
}

// RenderNode implements Renderer.RenderNode().
func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.WalkStatus {
// entering in gomarkdown was made to have elements of type switch
Expand Down Expand Up @@ -487,6 +526,8 @@ func (r Renderer) RenderNode(w io.Writer, node ast.Node, entering bool) ast.Walk
r.table(w, node, entering)
noNewLine = false
fetchLinks = true
case *ast.HTMLBlock:
mntn-xyz marked this conversation as resolved.
Show resolved Hide resolved
r.htmlBlock(w, node, entering)
}
if !noNewLine && !entering {
w.Write(lineBreak)
Expand Down