forgejo/modules/markup/mdstripper/mdstripper.go

200 lines
4.8 KiB
Go
Raw Normal View History

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package mdstripper
import (
"bytes"
"io"
"net/url"
"strings"
"sync"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup/common"
"code.gitea.io/gitea/modules/setting"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/renderer"
"github.com/yuin/goldmark/renderer/html"
"github.com/yuin/goldmark/text"
)
var (
giteaHostInit sync.Once
giteaHost *url.URL
)
type stripRenderer struct {
localhost *url.URL
links []string
empty bool
}
func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error {
return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
if !entering {
return ast.WalkContinue, nil
}
switch v := n.(type) {
case *ast.Text:
if !v.IsRaw() {
_, prevSibIsText := n.PreviousSibling().(*ast.Text)
coalesce := prevSibIsText
r.processString(
w,
v.Text(source),
coalesce)
if v.SoftLineBreak() {
r.doubleSpace(w)
}
}
return ast.WalkContinue, nil
case *ast.Link:
r.processLink(v.Destination)
return ast.WalkSkipChildren, nil
case *ast.AutoLink:
// This could be a reference to an issue or pull - if so convert it
r.processAutoLink(w, v.URL(source))
return ast.WalkSkipChildren, nil
}
return ast.WalkContinue, nil
})
}
func (r *stripRenderer) doubleSpace(w io.Writer) {
if !r.empty {
_, _ = w.Write([]byte{'\n'})
}
}
func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) {
// Always break-up words
if !coalesce {
r.doubleSpace(w)
}
_, _ = w.Write(text)
r.empty = false
}
// ProcessAutoLinks to detect and handle links to issues and pulls
func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) {
linkStr := string(link)
u, err := url.Parse(linkStr)
if err != nil {
// Process out of band
r.links = append(r.links, linkStr)
return
}
// Note: we're not attempting to match the URL scheme (http/https)
host := strings.ToLower(u.Host)
if host != "" && host != strings.ToLower(r.localhost.Host) {
// Process out of band
r.links = append(r.links, linkStr)
return
}
// We want: /user/repo/issues/3
parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/")
if len(parts) != 5 || parts[0] != "" {
// Process out of band
r.links = append(r.links, linkStr)
return
}
var sep string
if parts[3] == "issues" {
sep = "#"
} else if parts[3] == "pulls" {
sep = "!"
} else {
// Process out of band
r.links = append(r.links, linkStr)
return
}
_, _ = w.Write([]byte(parts[1]))
_, _ = w.Write([]byte("/"))
_, _ = w.Write([]byte(parts[2]))
_, _ = w.Write([]byte(sep))
_, _ = w.Write([]byte(parts[4]))
}
func (r *stripRenderer) processLink(link []byte) {
// Links are processed out of band
r.links = append(r.links, string(link))
}
// GetLinks returns the list of link data collected while parsing
func (r *stripRenderer) GetLinks() []string {
return r.links
}
// AddOptions adds given option to this renderer.
func (r *stripRenderer) AddOptions(...renderer.Option) {
// no-op
}
// StripMarkdown parses markdown content by removing all markup and code blocks
// in order to extract links and other references
func StripMarkdown(rawBytes []byte) (string, []string) {
buf, links := StripMarkdownBytes(rawBytes)
return string(buf), links
}
var (
stripParser parser.Parser
once = sync.Once{}
)
// StripMarkdownBytes parses markdown content by removing all markup and code blocks
// in order to extract links and other references
func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) {
once.Do(func() {
gdMarkdown := goldmark.New(
goldmark.WithExtensions(extension.Table,
extension.Strikethrough,
extension.TaskList,
extension.DefinitionList,
common.FootnoteExtension,
common.Linkify,
),
goldmark.WithParserOptions(
parser.WithAttribute(),
parser.WithAutoHeadingID(),
),
goldmark.WithRendererOptions(
html.WithUnsafe(),
),
)
stripParser = gdMarkdown.Parser()
})
stripper := &stripRenderer{
localhost: getGiteaHost(),
links: make([]string, 0, 10),
empty: true,
}
reader := text.NewReader(rawBytes)
doc := stripParser.Parse(reader)
var buf bytes.Buffer
if err := stripper.Render(&buf, rawBytes, doc); err != nil {
log.Error("Unable to strip: %v", err)
}
return buf.Bytes(), stripper.GetLinks()
}
// getGiteaHostName returns a normalized string with the local host name, with no scheme or port information
func getGiteaHost() *url.URL {
giteaHostInit.Do(func() {
var err error
if giteaHost, err = url.Parse(setting.AppURL); err != nil {
giteaHost = &url.URL{}
}
})
return giteaHost
}