From 9477fd7eba9bda6813b65c6c54380904892ca35e Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:03:00 +0100 Subject: [PATCH] [feature] Allow partial-word hashtags using non-breaking spaces (#3606) * [feature] Allow partial-word hashtags using non-breaking spaces * update docs --- docs/user_guide/posts.md | 3 +++ internal/text/plain_test.go | 13 +++++++++++++ internal/text/util.go | 30 ++++++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/posts.md b/docs/user_guide/posts.md index 1f718cfae..c45ad4bcb 100644 --- a/docs/user_guide/posts.md +++ b/docs/user_guide/posts.md @@ -285,6 +285,9 @@ For accessibility reasons, it is considerate to use upper camel case when you're You can include as many hashtags as you like within a GoToSocial post, and each hashtag has a length limit of 100 characters. +!!! tip + To end a hashtag, you can simply use a space: for example, in the text `this #soup rules`, the hashtag is terminated by a space, so `#soup` becomes the hashtag. However, you can also use a pipe character `|`, or the Unicode characters `\u200B` (zero-width space) or `\uFEFF` (zero-width no-break space), to create "partial-word" hashtags. For example, with input text `this #so|up rules`, only the `#so` part becomes the hashtag. Likewise, with the input text `this #so​up rules`, which contains an invisible zero-width space after the o and before the u, only the `#so` part becomes the hashtag. See here for more information on zero-width spaces: https://en.wikipedia.org/wiki/Zero-width_space. 
+ ## Input Sanitization In order not to spread scripts, vulnerabilities, and glitchy HTML all over the place, GoToSocial performs the following types of input sanitization: diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index 48280bb44..fac54a38e 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -36,6 +36,8 @@ moreComplexExpected = "
Another test @foss_satan
#Hashtag
Text
:rainbow:
here's a link with utf-8 characters in it: https://example.org/söme_url
" + withFunkyTags = "#hashtag1 pee #hashtag2\u200Bpee #hashtag3|poo #hashtag4\uFEFFpoo" + withFunkyTagsExpected = "#hashtag1 pee #hashtag2\u200bpee #hashtag3|poo #hashtag4\ufeffpoo
" ) type PlainTestSuite struct { @@ -136,6 +138,17 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() { suite.Equal("올빼미", tags[0].Name) } +func (suite *PlainTestSuite) TestFunkyTags() { + formatted := suite.FromPlain(withFunkyTags) + suite.Equal(withFunkyTagsExpected, formatted.HTML) + + tags := formatted.Tags + suite.Equal("hashtag1", tags[0].Name) + suite.Equal("hashtag2", tags[1].Name) + suite.Equal("hashtag3", tags[2].Name) + suite.Equal("hashtag4", tags[3].Name) +} + func (suite *PlainTestSuite) TestDeriveMultiple() { statusText := `Another test @foss_satan@fossbros-anonymous.io diff --git a/internal/text/util.go b/internal/text/util.go index 204c64838..af45cfaf0 100644 --- a/internal/text/util.go +++ b/internal/text/util.go @@ -38,8 +38,34 @@ func isPermittedInHashtag(r rune) bool { // is a recognized break character for before // or after a #hashtag. func isHashtagBoundary(r rune) bool { - return unicode.IsSpace(r) || - (unicode.IsPunct(r) && r != '_') + switch { + + // Zero width space. + case r == '\u200B': + return true + + // Zero width no-break space. + case r == '\uFEFF': + return true + + // Pipe character sometimes + // used as workaround. + case r == '|': + return true + + // Standard Unicode white space. + case unicode.IsSpace(r): + return true + + // Non-underscore punctuation. + case unicode.IsPunct(r) && r != '_': + return true + + // Not recognized + // hashtag boundary. + default: + return false + } } // isMentionBoundary returns true if rune r