gotosocial/internal/regexes/regexes.go

191 lines
9 KiB
Go
Raw Permalink Normal View History

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2021-09-01 16:29:25 +00:00
package regexes
import (
[bugfix] Fix existing bio text showing as HTML (#531) * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim <grufwub@gmail.com> * go away linter Signed-off-by: kim <grufwub@gmail.com> * change buf reset location, change html mention tags Signed-off-by: kim <grufwub@gmail.com> * reduce FindLinks code complexity Signed-off-by: kim <grufwub@gmail.com> * fix HTML to text conversion Signed-off-by: kim <grufwub@gmail.com> * Update internal/regexes/regexes.go Co-authored-by: Mina Galić <mina.galic@puppet.com> * use improved html2text lib with more options Signed-off-by: kim <grufwub@gmail.com> * fix to produce actual plaintext from html Signed-off-by: kim <grufwub@gmail.com> * fix span tags instead written as space Signed-off-by: kim <grufwub@gmail.com> * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim <grufwub@gmail.com> * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim <grufwub@gmail.com> * use matched input string for link replace href text Signed-off-by: kim <grufwub@gmail.com> * remove unused code (to appease linter :sobs:) Signed-off-by: kim <grufwub@gmail.com> * improve hashtagFinger regex to be more compliant Signed-off-by: kim <grufwub@gmail.com> * update breakReplacer to include both unix and windows line endings Signed-off-by: kim <grufwub@gmail.com> * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim <grufwub@gmail.com> * drop unnecessary code Signed-off-by: kim <grufwub@gmail.com> * update text package tests to fix logic changes Signed-off-by: kim <grufwub@gmail.com> * add raw note content testing to account update and account verify Signed-off-by: kim <grufwub@gmail.com> * remove unused modules Signed-off-by: kim <grufwub@gmail.com> * fix emoji regex Signed-off-by: kim <grufwub@gmail.com> * fix replacement of hashtags Signed-off-by: kim <grufwub@gmail.com> * update code comment Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: Mina Galić <mina.galic@puppet.com>
2022-05-07 15:55:27 +00:00
"bytes"
2021-09-01 16:29:25 +00:00
"regexp"
[bugfix] Fix existing bio text showing as HTML (#531) * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim <grufwub@gmail.com> * go away linter Signed-off-by: kim <grufwub@gmail.com> * change buf reset location, change html mention tags Signed-off-by: kim <grufwub@gmail.com> * reduce FindLinks code complexity Signed-off-by: kim <grufwub@gmail.com> * fix HTML to text conversion Signed-off-by: kim <grufwub@gmail.com> * Update internal/regexes/regexes.go Co-authored-by: Mina Galić <mina.galic@puppet.com> * use improved html2text lib with more options Signed-off-by: kim <grufwub@gmail.com> * fix to produce actual plaintext from html Signed-off-by: kim <grufwub@gmail.com> * fix span tags instead written as space Signed-off-by: kim <grufwub@gmail.com> * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim <grufwub@gmail.com> * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim <grufwub@gmail.com> * use matched input string for link replace href text Signed-off-by: kim <grufwub@gmail.com> * remove unused code (to appease linter :sobs:) Signed-off-by: kim <grufwub@gmail.com> * improve hashtagFinger regex to be more compliant Signed-off-by: kim <grufwub@gmail.com> * update breakReplacer to include both unix and windows line endings Signed-off-by: kim <grufwub@gmail.com> * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim <grufwub@gmail.com> * drop unnecessary code Signed-off-by: kim <grufwub@gmail.com> * update text package tests to fix logic changes Signed-off-by: kim <grufwub@gmail.com> * add raw note content testing to account update and account verify Signed-off-by: kim <grufwub@gmail.com> * remove unused modules Signed-off-by: kim <grufwub@gmail.com> * fix emoji regex Signed-off-by: kim <grufwub@gmail.com> * fix replacement of hashtags Signed-off-by: kim <grufwub@gmail.com> * update code comment Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: Mina Galić <mina.galic@puppet.com>
2022-05-07 15:55:27 +00:00
"sync"
"mvdan.cc/xurls/v2"
2021-09-01 16:29:25 +00:00
)
const (
2021-09-02 10:24:18 +00:00
users = "users"
actors = "actors"
statuses = "statuses"
inbox = "inbox"
outbox = "outbox"
followers = "followers"
following = "following"
liked = "liked"
publicKey = "main-key"
follow = "follow"
blocks = "blocks"
reports = "reports"
accepts = "accepts"
2021-09-01 16:29:25 +00:00
schemes = `(http|https)://` // Allowed URI protocols for parsing links in text.
alphaNumeric = `\p{L}\p{M}*|\p{N}` // A single number or script character in any language, including chars with accents.
usernameGrp = `(?:` + alphaNumeric + `|\.|\-|\_)` // Non-capturing group that matches against a single valid username character.
domainGrp = `(?:` + alphaNumeric + `|\.|\-|\:)` // Non-capturing group that matches against a single valid domain character.
mentionName = `^@(` + usernameGrp + `+)(?:@(` + domainGrp + `+))?$` // Extract parts of one mention, maybe including domain.
mentionFinder = `(?:^|\s)(@` + usernameGrp + `+(?:@` + domainGrp + `+)?)` // Extract all mentions from a text, each mention may include domain.
emojiShortcode = `\w{1,30}` // Pattern for emoji shortcodes. maximumEmojiShortcodeLength = 30
emojiFinder = `(?:\b)?:(` + emojiShortcode + `):(?:\b)?` // Extract all emoji shortcodes from a text.
emojiValidator = `^` + emojiShortcode + `$` // Validate a single emoji shortcode.
usernameStrict = `^[a-z0-9_]{1,64}$` // Pattern for usernames on THIS instance. maximumUsernameLength = 64
usernameRelaxed = `[a-z0-9_\.]{1,}` // Relaxed version of username that can match instance accounts too.
misskeyReportNotesFinder = `(?m)(?:^Note: ((?:http|https):\/\/.*)$)` // Extract reported Note URIs from the text of a Misskey report/flag.
ulid = `[0123456789ABCDEFGHJKMNPQRSTVWXYZ]{26}` // Pattern for ULID.
ulidValidate = `^` + ulid + `$` // Validate one ULID.
/*
Path parts / capture.
*/
userPathPrefix = `^/?` + users + `/(` + usernameRelaxed + `)`
userPath = userPathPrefix + `$`
userWebPathPrefix = `^/?` + `@(` + usernameRelaxed + `)`
userWebPath = userWebPathPrefix + `$`
publicKeyPath = userPathPrefix + `/` + publicKey + `$`
inboxPath = userPathPrefix + `/` + inbox + `$`
outboxPath = userPathPrefix + `/` + outbox + `$`
followersPath = userPathPrefix + `/` + followers + `$`
followingPath = userPathPrefix + `/` + following + `$`
likedPath = userPathPrefix + `/` + liked + `$`
followPath = userPathPrefix + `/` + follow + `/(` + ulid + `)$`
likePath = userPathPrefix + `/` + liked + `/(` + ulid + `)$`
statusesPath = userPathPrefix + `/` + statuses + `/(` + ulid + `)$`
acceptsPath = userPathPrefix + `/` + accepts + `/(` + ulid + `)$`
blockPath = userPathPrefix + `/` + blocks + `/(` + ulid + `)$`
reportPath = `^/?` + reports + `/(` + ulid + `)$`
filePath = `^/?(` + ulid + `)/([a-z]+)/([a-z]+)/(` + ulid + `)\.([a-z0-9]+)$`
2021-09-01 16:29:25 +00:00
)
var (
[bugfix] Fix existing bio text showing as HTML (#531) * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim <grufwub@gmail.com> * go away linter Signed-off-by: kim <grufwub@gmail.com> * change buf reset location, change html mention tags Signed-off-by: kim <grufwub@gmail.com> * reduce FindLinks code complexity Signed-off-by: kim <grufwub@gmail.com> * fix HTML to text conversion Signed-off-by: kim <grufwub@gmail.com> * Update internal/regexes/regexes.go Co-authored-by: Mina Galić <mina.galic@puppet.com> * use improved html2text lib with more options Signed-off-by: kim <grufwub@gmail.com> * fix to produce actual plaintext from html Signed-off-by: kim <grufwub@gmail.com> * fix span tags instead written as space Signed-off-by: kim <grufwub@gmail.com> * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim <grufwub@gmail.com> * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim <grufwub@gmail.com> * use matched input string for link replace href text Signed-off-by: kim <grufwub@gmail.com> * remove unused code (to appease linter :sobs:) Signed-off-by: kim <grufwub@gmail.com> * improve hashtagFinger regex to be more compliant Signed-off-by: kim <grufwub@gmail.com> * update breakReplacer to include both unix and windows line endings Signed-off-by: kim <grufwub@gmail.com> * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim <grufwub@gmail.com> * drop unnecessary code Signed-off-by: kim <grufwub@gmail.com> * update text package tests to fix logic changes Signed-off-by: kim <grufwub@gmail.com> * add raw note content testing to account update and account verify Signed-off-by: kim <grufwub@gmail.com> * remove unused modules Signed-off-by: kim <grufwub@gmail.com> * fix emoji regex Signed-off-by: kim <grufwub@gmail.com> * fix replacement of hashtags Signed-off-by: kim <grufwub@gmail.com> * update code comment Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: Mina Galić <mina.galic@puppet.com>
2022-05-07 15:55:27 +00:00
// LinkScheme captures http/https schemes in URLs.
LinkScheme = func() *regexp.Regexp {
rgx, err := xurls.StrictMatchingScheme(schemes)
if err != nil {
panic(err)
}
return rgx
}()
// MentionName captures the username and domain part from
// a mention string such as @whatever_user@example.org,
// returning whatever_user and example.org (without the @ symbols).
// Will also work for characters with umlauts and other accents.
// See: https://regex101.com/r/9tjNUy/1 for explanation and examples.
2021-09-01 16:29:25 +00:00
MentionName = regexp.MustCompile(mentionName)
// MentionFinder extracts whole mentions from a piece of text.
2021-09-01 16:29:25 +00:00
MentionFinder = regexp.MustCompile(mentionFinder)
// EmojiValidator validates an emoji shortcode.
EmojiValidator = regexp.MustCompile(emojiValidator)
2021-09-01 16:29:25 +00:00
// EmojiFinder extracts emoji strings from a piece of text.
// See: https://regex101.com/r/478XGM/1
EmojiFinder = regexp.MustCompile(emojiFinder)
2021-09-01 16:29:25 +00:00
// Username can be used to validate usernames of new signups on this instance.
Username = regexp.MustCompile(usernameStrict)
2021-09-01 16:29:25 +00:00
// MisskeyReportNotes captures a list of Note URIs from report content created by Misskey.
// See: https://regex101.com/r/EnTOBV/1
MisskeyReportNotes = regexp.MustCompile(misskeyReportNotesFinder)
// UserPath validates and captures the username part from eg /users/example_username.
UserPath = regexp.MustCompile(userPath)
2021-09-01 16:29:25 +00:00
// UserWebPath validates and captures the username part from eg /@example_username.
UserWebPath = regexp.MustCompile(userWebPath)
2021-09-01 16:29:25 +00:00
// PublicKeyPath parses a path that validates and captures the username part from eg /users/example_username/main-key
PublicKeyPath = regexp.MustCompile(publicKeyPath)
// InboxPath parses a path that validates and captures the username part from eg /users/example_username/inbox
InboxPath = regexp.MustCompile(inboxPath)
// OutboxPath parses a path that validates and captures the username part from eg /users/example_username/outbox
OutboxPath = regexp.MustCompile(outboxPath)
// FollowersPath parses a path that validates and captures the username part from eg /users/example_username/followers
FollowersPath = regexp.MustCompile(followersPath)
// FollowingPath parses a path that validates and captures the username part from eg /users/example_username/following
FollowingPath = regexp.MustCompile(followingPath)
// LikedPath parses a path that validates and captures the username part from eg /users/example_username/liked
LikedPath = regexp.MustCompile(likedPath)
2021-09-01 16:29:25 +00:00
// ULID parses and validate a ULID.
ULID = regexp.MustCompile(ulidValidate)
2021-09-01 16:29:25 +00:00
// FollowPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/follow/01F7XT5JZW1WMVSW1KADS8PVDH
FollowPath = regexp.MustCompile(followPath)
2021-09-01 16:29:25 +00:00
// LikePath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/liked/01F7XT5JZW1WMVSW1KADS8PVDH
2021-09-01 16:29:25 +00:00
LikePath = regexp.MustCompile(likePath)
// StatusesPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/statuses/01F7XT5JZW1WMVSW1KADS8PVDH
// The regex can be played with here: https://regex101.com/r/G9zuxQ/1
StatusesPath = regexp.MustCompile(statusesPath)
// BlockPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH
BlockPath = regexp.MustCompile(blockPath)
// ReportPath parses a path that validates and captures the ulid part
// from eg /reports/01GP3AWY4CRDVRNZKW0TEAMB5R
ReportPath = regexp.MustCompile(reportPath)
// ReportPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/accepts/01GP3AWY4CRDVRNZKW0TEAMB5R
AcceptsPath = regexp.MustCompile(acceptsPath)
// FilePath parses a file storage path of the form [ACCOUNT_ID]/[MEDIA_TYPE]/[MEDIA_SIZE]/[FILE_NAME]
// eg 01F8MH1H7YV1Z7D2C8K2730QBF/attachment/small/01F8MH8RMYQ6MSNY3JM2XT1CQ5.jpeg
// It captures the account id, media type, media size, file name, and file extension, eg
// `01F8MH1H7YV1Z7D2C8K2730QBF`, `attachment`, `small`, `01F8MH8RMYQ6MSNY3JM2XT1CQ5`, `jpeg`.
FilePath = regexp.MustCompile(filePath)
2021-09-01 16:29:25 +00:00
)
[bugfix] Fix existing bio text showing as HTML (#531) * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim <grufwub@gmail.com> * go away linter Signed-off-by: kim <grufwub@gmail.com> * change buf reset location, change html mention tags Signed-off-by: kim <grufwub@gmail.com> * reduce FindLinks code complexity Signed-off-by: kim <grufwub@gmail.com> * fix HTML to text conversion Signed-off-by: kim <grufwub@gmail.com> * Update internal/regexes/regexes.go Co-authored-by: Mina Galić <mina.galic@puppet.com> * use improved html2text lib with more options Signed-off-by: kim <grufwub@gmail.com> * fix to produce actual plaintext from html Signed-off-by: kim <grufwub@gmail.com> * fix span tags instead written as space Signed-off-by: kim <grufwub@gmail.com> * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim <grufwub@gmail.com> * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim <grufwub@gmail.com> * use matched input string for link replace href text Signed-off-by: kim <grufwub@gmail.com> * remove unused code (to appease linter :sobs:) Signed-off-by: kim <grufwub@gmail.com> * improve hashtagFinger regex to be more compliant Signed-off-by: kim <grufwub@gmail.com> * update breakReplacer to include both unix and windows line endings Signed-off-by: kim <grufwub@gmail.com> * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim <grufwub@gmail.com> * drop unnecessary code Signed-off-by: kim <grufwub@gmail.com> * update text package tests to fix logic changes Signed-off-by: kim <grufwub@gmail.com> * add raw note content testing to account update and account verify Signed-off-by: kim <grufwub@gmail.com> * remove unused modules Signed-off-by: kim <grufwub@gmail.com> * fix emoji regex Signed-off-by: kim <grufwub@gmail.com> * fix replacement of hashtags Signed-off-by: kim <grufwub@gmail.com> * update code comment Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: Mina Galić <mina.galic@puppet.com>
2022-05-07 15:55:27 +00:00
// bufpool is a memory pool of byte buffers for use in our regex utility functions.
var bufpool = sync.Pool{
New: func() any {
buf := bytes.NewBuffer(make([]byte, 0, 512))
return buf
},
}
// ReplaceAllStringFunc will call through to .ReplaceAllStringFunc in the provided regex, but provide you a clean byte buffer for optimized string writes.
func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string {
buf := bufpool.Get().(*bytes.Buffer) //nolint
defer bufpool.Put(buf)
return rgx.ReplaceAllStringFunc(src, func(match string) string {
buf.Reset() // reset use
return repl(match, buf)
})
}