mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2024-11-25 13:16:40 +00:00
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
This commit is contained in:
parent
6dd936fbe1
commit
efd1a4f717
|
@ -273,6 +273,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia
|
||||||
- [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
|
- [jackc/pgconn](https://github.com/jackc/pgconn); Postgres driver. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||||
- [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
|
- [jackc/pgx](https://github.com/jackc/pgx); Postgres driver and toolkit. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||||
- [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
|
- [KimMachineGun/automemlimit](https://github.com/KimMachineGun/automemlimit); cgroups memory limit checking. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||||
|
- [k3a/html2text](https://github.com/k3a/html2text); HTML-to-text conversion. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||||
- [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
|
- [mcuadros/go-syslog](https://github.com/mcuadros/go-syslog); Syslog server library. [MIT License](https://spdx.org/licenses/MIT.html).
|
||||||
- [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
|
- [microcosm-cc/bluemonday](https://github.com/microcosm-cc/bluemonday); HTML user-input sanitization. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).
|
||||||
- [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).
|
- [miekg/dns](https://github.com/miekg/dns); DNS utilities. [Go License](https://go.dev/LICENSE).
|
||||||
|
|
1
go.mod
1
go.mod
|
@ -40,6 +40,7 @@ require (
|
||||||
github.com/gorilla/feeds v1.2.0
|
github.com/gorilla/feeds v1.2.0
|
||||||
github.com/gorilla/websocket v1.5.2
|
github.com/gorilla/websocket v1.5.2
|
||||||
github.com/jackc/pgx/v5 v5.7.1
|
github.com/jackc/pgx/v5 v5.7.1
|
||||||
|
github.com/k3a/html2text v1.2.1
|
||||||
github.com/microcosm-cc/bluemonday v1.0.27
|
github.com/microcosm-cc/bluemonday v1.0.27
|
||||||
github.com/miekg/dns v1.1.62
|
github.com/miekg/dns v1.1.62
|
||||||
github.com/minio/minio-go/v7 v7.0.76
|
github.com/minio/minio-go/v7 v7.0.76
|
||||||
|
|
2
go.sum
2
go.sum
|
@ -384,6 +384,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
|
||||||
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
|
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
|
||||||
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
|
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
|
||||||
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
|
github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88/go.mod h1:3w7q1U84EfirKl04SVQ/s7nPm1ZPhiXd34z40TNz36k=
|
||||||
|
github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY=
|
||||||
|
github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA=
|
||||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||||
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||||
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
|
||||||
|
|
20
internal/cache/cache.go
vendored
20
internal/cache/cache.go
vendored
|
@ -47,6 +47,11 @@ type Caches struct {
|
||||||
// Webfinger provides access to the webfinger URL cache.
|
// Webfinger provides access to the webfinger URL cache.
|
||||||
Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min
|
Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min
|
||||||
|
|
||||||
|
// TTL cache of statuses -> filterable text fields.
|
||||||
|
// To ensure up-to-date fields, cache is keyed as:
|
||||||
|
// `[status.ID][status.UpdatedAt.Unix()]`
|
||||||
|
StatusesFilterableFields *ttl.Cache[string, []string]
|
||||||
|
|
||||||
// prevent pass-by-value.
|
// prevent pass-by-value.
|
||||||
_ nocopy
|
_ nocopy
|
||||||
}
|
}
|
||||||
|
@ -109,6 +114,7 @@ func (c *Caches) Init() {
|
||||||
c.initUserMuteIDs()
|
c.initUserMuteIDs()
|
||||||
c.initWebfinger()
|
c.initWebfinger()
|
||||||
c.initVisibility()
|
c.initVisibility()
|
||||||
|
c.initStatusesFilterableFields()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start will start any caches that require a background
|
// Start will start any caches that require a background
|
||||||
|
@ -119,6 +125,10 @@ func (c *Caches) Start() {
|
||||||
tryUntil("starting webfinger cache", 5, func() bool {
|
tryUntil("starting webfinger cache", 5, func() bool {
|
||||||
return c.Webfinger.Start(5 * time.Minute)
|
return c.Webfinger.Start(5 * time.Minute)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
tryUntil("starting statusesFilterableFields cache", 5, func() bool {
|
||||||
|
return c.StatusesFilterableFields.Start(5 * time.Minute)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop will stop any caches that require a background
|
// Stop will stop any caches that require a background
|
||||||
|
@ -127,6 +137,7 @@ func (c *Caches) Stop() {
|
||||||
log.Infof(nil, "stop: %p", c)
|
log.Infof(nil, "stop: %p", c)
|
||||||
|
|
||||||
tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
|
tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop)
|
||||||
|
tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sweep will sweep all the available caches to ensure none
|
// Sweep will sweep all the available caches to ensure none
|
||||||
|
@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {
|
||||||
24*time.Hour,
|
24*time.Hour,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Caches) initStatusesFilterableFields() {
|
||||||
|
c.StatusesFilterableFields = new(ttl.Cache[string, []string])
|
||||||
|
c.StatusesFilterableFields.Init(
|
||||||
|
0,
|
||||||
|
512,
|
||||||
|
1*time.Hour,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
import (
|
import (
|
||||||
"regexp"
|
"regexp"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Filter stores a filter created by a local account.
|
// Filter stores a filter created by a local account.
|
||||||
|
@ -61,14 +63,23 @@ type FilterKeyword struct {
|
||||||
|
|
||||||
// Compile will compile this FilterKeyword as a prepared regular expression.
|
// Compile will compile this FilterKeyword as a prepared regular expression.
|
||||||
func (k *FilterKeyword) Compile() (err error) {
|
func (k *FilterKeyword) Compile() (err error) {
|
||||||
var wordBreak string
|
var (
|
||||||
if k.WholeWord != nil && *k.WholeWord {
|
wordBreakStart string
|
||||||
wordBreak = `\b`
|
wordBreakEnd string
|
||||||
|
)
|
||||||
|
|
||||||
|
if util.PtrOrZero(k.WholeWord) {
|
||||||
|
// Either word boundary or
|
||||||
|
// whitespace or start of line.
|
||||||
|
wordBreakStart = `(?:\b|\s|^)`
|
||||||
|
// Either word boundary or
|
||||||
|
// whitespace or end of line.
|
||||||
|
wordBreakEnd = `(?:\b|\s|$)`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compile keyword filter regexp.
|
// Compile keyword filter regexp.
|
||||||
quoted := regexp.QuoteMeta(k.Keyword)
|
quoted := regexp.QuoteMeta(k.Keyword)
|
||||||
k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak)
|
k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)
|
||||||
return // caller is expected to wrap this error
|
return // caller is expected to wrap this error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,8 @@
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"slices"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -35,7 +37,6 @@
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/log"
|
"github.com/superseriousbusiness/gotosocial/internal/log"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/media"
|
"github.com/superseriousbusiness/gotosocial/internal/media"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/text"
|
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/uris"
|
"github.com/superseriousbusiness/gotosocial/internal/uris"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/util"
|
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||||
)
|
)
|
||||||
|
@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract text fields from the status that we will match filters against.
|
// Key this status based on ID + last updated time,
|
||||||
fields := filterableTextFields(s)
|
// to ensure we always filter on latest version.
|
||||||
|
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
|
||||||
|
|
||||||
|
// Check if we have filterable fields cached for this status.
|
||||||
|
cache := c.state.Caches.StatusesFilterableFields
|
||||||
|
fields, stored := cache.Get(statusKey)
|
||||||
|
if !stored {
|
||||||
|
// We don't have filterable fields
|
||||||
|
// cached, calculate + cache now.
|
||||||
|
fields = filterableFields(s)
|
||||||
|
cache.Set(statusKey, fields)
|
||||||
|
}
|
||||||
|
|
||||||
// Record all matching warn filters and the reasons they matched.
|
// Record all matching warn filters and the reasons they matched.
|
||||||
filterResults := make([]apimodel.FilterResult, 0, len(filters))
|
filterResults := make([]apimodel.FilterResult, 0, len(filters))
|
||||||
for _, filter := range filters {
|
for _, filter := range filters {
|
||||||
if !filterAppliesInContext(filter, filterContext) {
|
if !filterAppliesInContext(filter, filterContext) {
|
||||||
// Filter doesn't apply to this context.
|
// Filter doesn't apply
|
||||||
continue
|
// to this context.
|
||||||
}
|
|
||||||
if filter.Expired(now) {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// List all matching keywords.
|
if filter.Expired(now) {
|
||||||
|
// Filter doesn't
|
||||||
|
// apply anymore.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assemble matching keywords (if any) from this filter.
|
||||||
keywordMatches := make([]string, 0, len(filter.Keywords))
|
keywordMatches := make([]string, 0, len(filter.Keywords))
|
||||||
for _, filterKeyword := range filter.Keywords {
|
for _, keyword := range filter.Keywords {
|
||||||
var isMatch bool
|
// Check if at least one filterable field
|
||||||
for _, field := range fields {
|
// in the status matches on this filter.
|
||||||
if filterKeyword.Regexp.MatchString(field) {
|
if slices.ContainsFunc(
|
||||||
isMatch = true
|
fields,
|
||||||
break
|
func(field string) bool {
|
||||||
}
|
return keyword.Regexp.MatchString(field)
|
||||||
}
|
},
|
||||||
if isMatch {
|
) {
|
||||||
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
|
// At least one field matched on this filter.
|
||||||
|
keywordMatches = append(keywordMatches, keyword.Keyword)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
|
||||||
return filterResults, nil
|
return filterResults, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// filterableTextFields returns all text from a status that we might want to filter on:
|
|
||||||
// - content
|
|
||||||
// - content warning
|
|
||||||
// - media descriptions
|
|
||||||
// - poll options
|
|
||||||
func filterableTextFields(s *gtsmodel.Status) []string {
|
|
||||||
fieldCount := 2 + len(s.Attachments)
|
|
||||||
if s.Poll != nil {
|
|
||||||
fieldCount += len(s.Poll.Options)
|
|
||||||
}
|
|
||||||
fields := make([]string, 0, fieldCount)
|
|
||||||
|
|
||||||
if s.Content != "" {
|
|
||||||
fields = append(fields, text.SanitizeToPlaintext(s.Content))
|
|
||||||
}
|
|
||||||
if s.ContentWarning != "" {
|
|
||||||
fields = append(fields, s.ContentWarning)
|
|
||||||
}
|
|
||||||
for _, attachment := range s.Attachments {
|
|
||||||
if attachment.Description != "" {
|
|
||||||
fields = append(fields, attachment.Description)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if s.Poll != nil {
|
|
||||||
for _, option := range s.Poll.Options {
|
|
||||||
if option != "" {
|
|
||||||
fields = append(fields, option)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return fields
|
|
||||||
}
|
|
||||||
|
|
||||||
// filterAppliesInContext returns whether a given filter applies in a given context.
|
// filterAppliesInContext returns whether a given filter applies in a given context.
|
||||||
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
|
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
|
||||||
switch filterContext {
|
switch filterContext {
|
||||||
|
|
|
@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {
|
||||||
|
|
||||||
// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
|
// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
|
||||||
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
|
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
|
||||||
testStatus := suite.testStatuses["admin_account_status_1"]
|
testStatus := new(gtsmodel.Status)
|
||||||
|
*testStatus = *suite.testStatuses["admin_account_status_1"]
|
||||||
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`
|
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`
|
||||||
|
|
||||||
if boost {
|
if boost {
|
||||||
// Modify a fixture boost into a boost of the above status.
|
boost, err := suite.typeconverter.StatusToBoost(
|
||||||
boostStatus := suite.testStatuses["admin_account_status_4"]
|
context.Background(),
|
||||||
boostStatus.BoostOf = testStatus
|
testStatus,
|
||||||
boostStatus.BoostOfID = testStatus.ID
|
suite.testAccounts["admin_account"],
|
||||||
testStatus = boostStatus
|
"",
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
suite.FailNow(err.Error())
|
||||||
|
}
|
||||||
|
testStatus = boost
|
||||||
}
|
}
|
||||||
|
|
||||||
requestingAccount := suite.testAccounts["local_account_1"]
|
requestingAccount := suite.testAccounts["local_account_1"]
|
||||||
|
@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
|
||||||
[]*gtsmodel.Filter{filter},
|
[]*gtsmodel.Filter{filter},
|
||||||
nil,
|
nil,
|
||||||
)
|
)
|
||||||
if suite.NoError(err) {
|
if err != nil {
|
||||||
suite.NotEmpty(apiStatus.Filtered)
|
suite.FailNow(err.Error())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
suite.NotEmpty(apiStatus.Filtered)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
|
func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
|
||||||
|
|
|
@ -27,6 +27,7 @@
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/k3a/html2text"
|
||||||
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
|
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||||
|
@ -284,3 +285,64 @@ func ContentToContentLanguage(
|
||||||
|
|
||||||
return contentStr, langTagStr
|
return contentStr, langTagStr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// filterableFields returns text fields from
|
||||||
|
// a status that we might want to filter on:
|
||||||
|
//
|
||||||
|
// - content warning
|
||||||
|
// - content (converted to plaintext from HTML)
|
||||||
|
// - media descriptions
|
||||||
|
// - poll options
|
||||||
|
//
|
||||||
|
// Each field should be filtered separately.
|
||||||
|
// This avoids scenarios where false-positive
|
||||||
|
// multiple-word matches can be made by matching
|
||||||
|
// the last word of one field + the first word
|
||||||
|
// of the next field together.
|
||||||
|
func filterableFields(s *gtsmodel.Status) []string {
|
||||||
|
// Estimate length of fields.
|
||||||
|
fieldCount := 2 + len(s.Attachments)
|
||||||
|
if s.Poll != nil {
|
||||||
|
fieldCount += len(s.Poll.Options)
|
||||||
|
}
|
||||||
|
fields := make([]string, 0, fieldCount)
|
||||||
|
|
||||||
|
// Content warning / title.
|
||||||
|
if s.ContentWarning != "" {
|
||||||
|
fields = append(fields, s.ContentWarning)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Status content. Though we have raw text
|
||||||
|
// available for statuses created on our
|
||||||
|
// instance, use the html2text version to
|
||||||
|
// remove markdown-formatting characters
|
||||||
|
// and ensure more consistent filtering.
|
||||||
|
if s.Content != "" {
|
||||||
|
text := html2text.HTML2TextWithOptions(
|
||||||
|
s.Content,
|
||||||
|
html2text.WithLinksInnerText(),
|
||||||
|
html2text.WithUnixLineBreaks(),
|
||||||
|
)
|
||||||
|
if text != "" {
|
||||||
|
fields = append(fields, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Media descriptions.
|
||||||
|
for _, attachment := range s.Attachments {
|
||||||
|
if attachment.Description != "" {
|
||||||
|
fields = append(fields, attachment.Description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Poll options.
|
||||||
|
if s.Poll != nil {
|
||||||
|
for _, opt := range s.Poll.Options {
|
||||||
|
if opt != "" {
|
||||||
|
fields = append(fields, opt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fields
|
||||||
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
"context"
|
"context"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||||
|
@ -158,3 +159,62 @@ type testcase struct {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFilterableText(t *testing.T) {
|
||||||
|
type testcase struct {
|
||||||
|
status *gtsmodel.Status
|
||||||
|
expectedFields []string
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, testcase := range []testcase{
|
||||||
|
{
|
||||||
|
status: >smodel.Status{
|
||||||
|
ContentWarning: "This is a test status",
|
||||||
|
Content: `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
|
||||||
|
},
|
||||||
|
expectedFields: []string{
|
||||||
|
"This is a test status",
|
||||||
|
"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
status: >smodel.Status{
|
||||||
|
Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
|
||||||
|
},
|
||||||
|
expectedFields: []string{
|
||||||
|
"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
status: >smodel.Status{
|
||||||
|
ContentWarning: "Nerd stuff",
|
||||||
|
Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
|
||||||
|
Attachments: []*gtsmodel.MediaAttachment{
|
||||||
|
{
|
||||||
|
Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Description: `Another media attachment`,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Poll: >smodel.Poll{
|
||||||
|
Options: []string{
|
||||||
|
"Poll option 1",
|
||||||
|
"Poll option 2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectedFields: []string{
|
||||||
|
"Nerd stuff",
|
||||||
|
"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.",
|
||||||
|
"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.",
|
||||||
|
"Another media attachment",
|
||||||
|
"Poll option 1",
|
||||||
|
"Poll option 2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
fields := filterableFields(testcase.status)
|
||||||
|
assert.Equal(t, testcase.expectedFields, fields)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
language: go
|
||||||
|
go:
|
||||||
|
- master
|
||||||
|
before_install:
|
||||||
|
- go get github.com/axw/gocov/gocov
|
||||||
|
- go get github.com/mattn/goveralls
|
||||||
|
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
|
||||||
|
script:
|
||||||
|
- $HOME/gopath/bin/goveralls -service=travis-ci
|
||||||
|
|
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
|
||||||
|
[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
|
||||||
|
[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
|
||||||
|
[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
|
||||||
|
|
||||||
|
# html2text
|
||||||
|
|
||||||
|
A simple Golang package to convert HTML to plain text (without non-standard dependencies).
|
||||||
|
|
||||||
|
It converts HTML tags to text and also parses HTML entities into characters they represent.
|
||||||
|
The `<head>` section of the HTML document, as well as most other tags, is stripped out, but
|
||||||
|
links are properly converted into their href attribute.
|
||||||
|
|
||||||
|
It can be used for converting HTML emails into text.
|
||||||
|
|
||||||
|
Some tests are installed as well.
|
||||||
|
Uses semantic versioning and no breaking changes are planned.
|
||||||
|
|
||||||
|
Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
|
||||||
|
|
||||||
|
## Install
|
||||||
|
```bash
|
||||||
|
go get github.com/k3a/html2text
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/k3a/html2text"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
|
||||||
|
|
||||||
|
plain := html2text.HTML2Text(html)
|
||||||
|
|
||||||
|
fmt.Println(plain)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Outputs:
|
||||||
|
|
||||||
|
clean text
|
||||||
|
*/
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
To see all features, please look into `html2text_test.go`.
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
- https://github.com/jaytaylor/html2text (heavier, with more features)
|
||||||
|
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
|
|
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
|
@ -0,0 +1,333 @@
|
||||||
|
package html2text
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Line break constants
|
||||||
|
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
|
||||||
|
const (
|
||||||
|
WIN_LBR = "\r\n"
|
||||||
|
UNIX_LBR = "\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
var legacyLBR = WIN_LBR
|
||||||
|
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
|
||||||
|
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
|
||||||
|
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
|
||||||
|
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
|
||||||
|
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
|
||||||
|
|
||||||
|
type options struct {
|
||||||
|
lbr string
|
||||||
|
linksInnerText bool
|
||||||
|
listPrefix string
|
||||||
|
}
|
||||||
|
|
||||||
|
func newOptions() *options {
|
||||||
|
// apply defaults
|
||||||
|
return &options{
|
||||||
|
lbr: WIN_LBR,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Option is a functional option
|
||||||
|
type Option func(*options)
|
||||||
|
|
||||||
|
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
|
||||||
|
func WithUnixLineBreaks() Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.lbr = UNIX_LBR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
|
||||||
|
// Example: click news <http://bit.ly/2n4wXRs>
|
||||||
|
func WithLinksInnerText() Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.linksInnerText = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
|
||||||
|
func WithListSupportPrefix(prefix string) Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.listPrefix = prefix
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithListSupport formats <ul> and <li> lists with " - " prefix
|
||||||
|
func WithListSupport() Option {
|
||||||
|
return WithListSupportPrefix(" - ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseHTMLEntity(entName string) (string, bool) {
|
||||||
|
if r, ok := entity[entName]; ok {
|
||||||
|
return string(r), true
|
||||||
|
}
|
||||||
|
|
||||||
|
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
|
||||||
|
var (
|
||||||
|
err error
|
||||||
|
n int64
|
||||||
|
digits = match[1]
|
||||||
|
)
|
||||||
|
|
||||||
|
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
|
||||||
|
n, err = strconv.ParseInt(digits[1:], 16, 64)
|
||||||
|
} else {
|
||||||
|
n, err = strconv.ParseInt(digits, 10, 64)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
|
||||||
|
return string(rune(n)), true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
|
||||||
|
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
|
||||||
|
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
|
||||||
|
func SetUnixLbr(b bool) {
|
||||||
|
if b {
|
||||||
|
legacyLBR = UNIX_LBR
|
||||||
|
} else {
|
||||||
|
legacyLBR = WIN_LBR
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTMLEntitiesToText decodes HTML entities inside a provided
|
||||||
|
// string and returns decoded text
|
||||||
|
func HTMLEntitiesToText(htmlEntsText string) string {
|
||||||
|
outBuf := bytes.NewBufferString("")
|
||||||
|
inEnt := false
|
||||||
|
|
||||||
|
for i, r := range htmlEntsText {
|
||||||
|
switch {
|
||||||
|
case r == ';' && inEnt:
|
||||||
|
inEnt = false
|
||||||
|
continue
|
||||||
|
|
||||||
|
case r == '&': //possible html entity
|
||||||
|
entName := ""
|
||||||
|
isEnt := false
|
||||||
|
|
||||||
|
// parse the entity name - max 10 chars
|
||||||
|
chars := 0
|
||||||
|
for _, er := range htmlEntsText[i+1:] {
|
||||||
|
if er == ';' {
|
||||||
|
isEnt = true
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
entName += string(er)
|
||||||
|
}
|
||||||
|
|
||||||
|
chars++
|
||||||
|
if chars == 10 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if isEnt {
|
||||||
|
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||||
|
outBuf.WriteString(ent)
|
||||||
|
inEnt = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !inEnt {
|
||||||
|
outBuf.WriteRune(r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return outBuf.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeSpace appends a single space to outBuf, unless the buffer is empty
// or already ends in a space.
func writeSpace(outBuf *bytes.Buffer) {
	written := outBuf.Bytes()
	if len(written) == 0 || written[len(written)-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
|
||||||
|
|
||||||
|
// HTML2Text converts html into a text form
|
||||||
|
func HTML2Text(html string) string {
|
||||||
|
var opts []Option
|
||||||
|
if legacyLBR == UNIX_LBR {
|
||||||
|
opts = append(opts, WithUnixLineBreaks())
|
||||||
|
}
|
||||||
|
return HTML2TextWithOptions(html, opts...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML2TextWithOptions converts html into a text form with additional options
|
||||||
|
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
|
||||||
|
opts := newOptions()
|
||||||
|
for _, opt := range reqOpts {
|
||||||
|
opt(opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
inLen := len(html)
|
||||||
|
tagStart := 0
|
||||||
|
inEnt := false
|
||||||
|
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
|
||||||
|
shouldOutput := true
|
||||||
|
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
|
||||||
|
hrefs := []string{}
|
||||||
|
// new line cannot be printed at the beginning or
|
||||||
|
// for <p> after a new line created by previous <p></p>
|
||||||
|
canPrintNewline := false
|
||||||
|
|
||||||
|
outBuf := bytes.NewBufferString("")
|
||||||
|
|
||||||
|
for i, r := range html {
|
||||||
|
if inLen > 0 && i == inLen-1 {
|
||||||
|
// prevent new line at the end of the document
|
||||||
|
canPrintNewline = false
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
// skip new lines and spaces adding a single space if not there yet
|
||||||
|
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
|
||||||
|
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
|
||||||
|
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||||
|
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
|
||||||
|
writeSpace(outBuf)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
|
||||||
|
case r == ';' && inEnt: // end of html entity
|
||||||
|
inEnt = false
|
||||||
|
continue
|
||||||
|
|
||||||
|
case r == '&' && shouldOutput: // possible html entity
|
||||||
|
entName := ""
|
||||||
|
isEnt := false
|
||||||
|
|
||||||
|
// parse the entity name - max 10 chars
|
||||||
|
chars := 0
|
||||||
|
for _, er := range html[i+1:] {
|
||||||
|
if er == ';' {
|
||||||
|
isEnt = true
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
entName += string(er)
|
||||||
|
}
|
||||||
|
|
||||||
|
chars++
|
||||||
|
if chars == 10 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if isEnt {
|
||||||
|
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||||
|
outBuf.WriteString(ent)
|
||||||
|
inEnt = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case r == '<': // start of a tag
|
||||||
|
tagStart = i + 1
|
||||||
|
shouldOutput = false
|
||||||
|
continue
|
||||||
|
|
||||||
|
case r == '>': // end of a tag
|
||||||
|
shouldOutput = true
|
||||||
|
tag := html[tagStart:i]
|
||||||
|
tagNameLowercase := strings.ToLower(tag)
|
||||||
|
|
||||||
|
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
|
||||||
|
outBuf.WriteString(opts.lbr)
|
||||||
|
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
|
||||||
|
if opts.listPrefix != "" {
|
||||||
|
outBuf.WriteString(opts.lbr + opts.listPrefix)
|
||||||
|
} else {
|
||||||
|
outBuf.WriteString(opts.lbr)
|
||||||
|
}
|
||||||
|
} else if headersRE.MatchString(tagNameLowercase) {
|
||||||
|
if canPrintNewline {
|
||||||
|
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||||
|
}
|
||||||
|
canPrintNewline = false
|
||||||
|
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
|
||||||
|
// new line
|
||||||
|
outBuf.WriteString(opts.lbr)
|
||||||
|
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
|
||||||
|
if canPrintNewline {
|
||||||
|
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||||
|
}
|
||||||
|
canPrintNewline = false
|
||||||
|
} else if opts.linksInnerText && tagNameLowercase == "/a" {
|
||||||
|
// end of link
|
||||||
|
// links can be empty can happen if the link matches the badLinkHrefRE
|
||||||
|
if len(hrefs) > 0 {
|
||||||
|
outBuf.WriteString(" <")
|
||||||
|
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
|
||||||
|
outBuf.WriteString(">")
|
||||||
|
hrefs = hrefs[1:]
|
||||||
|
}
|
||||||
|
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
|
||||||
|
// parse link href
|
||||||
|
// add special handling for a tags
|
||||||
|
m := linkTagRE.FindStringSubmatch(tag)
|
||||||
|
if len(m) == 5 {
|
||||||
|
link := m[2]
|
||||||
|
if len(link) == 0 {
|
||||||
|
link = m[3]
|
||||||
|
if len(link) == 0 {
|
||||||
|
link = m[4]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
|
||||||
|
hrefs = append(hrefs, link)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if badTagnamesRE.MatchString(tagNameLowercase) {
|
||||||
|
// unwanted block
|
||||||
|
badTagStackDepth++
|
||||||
|
|
||||||
|
// if link inner text preservation is not enabled
|
||||||
|
// and the current tag is a link tag, parse its href and output that
|
||||||
|
if !opts.linksInnerText {
|
||||||
|
// parse link href
|
||||||
|
m := linkTagRE.FindStringSubmatch(tag)
|
||||||
|
if len(m) == 5 {
|
||||||
|
link := m[2]
|
||||||
|
if len(link) == 0 {
|
||||||
|
link = m[3]
|
||||||
|
if len(link) == 0 {
|
||||||
|
link = m[4]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !badLinkHrefRE.MatchString(link) {
|
||||||
|
outBuf.WriteString(HTMLEntitiesToText(link))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
|
||||||
|
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
|
||||||
|
// end of unwanted block
|
||||||
|
badTagStackDepth--
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
|
||||||
|
} // switch end
|
||||||
|
|
||||||
|
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||||
|
canPrintNewline = true
|
||||||
|
outBuf.WriteRune(r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return outBuf.String()
|
||||||
|
}
|
3
vendor/modules.txt
vendored
3
vendor/modules.txt
vendored
|
@ -446,6 +446,9 @@ github.com/josharian/intern
|
||||||
# github.com/json-iterator/go v1.1.12
|
# github.com/json-iterator/go v1.1.12
|
||||||
## explicit; go 1.12
|
## explicit; go 1.12
|
||||||
github.com/json-iterator/go
|
github.com/json-iterator/go
|
||||||
|
# github.com/k3a/html2text v1.2.1
|
||||||
|
## explicit; go 1.16
|
||||||
|
github.com/k3a/html2text
|
||||||
# github.com/klauspost/compress v1.17.9
|
# github.com/klauspost/compress v1.17.9
|
||||||
## explicit; go 1.20
|
## explicit; go 1.20
|
||||||
github.com/klauspost/compress
|
github.com/klauspost/compress
|
||||||
|
|
Loading…
Reference in a new issue