mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-02-04 20:05:05 +00:00
[bugfix] Extend parser to handle more non-Latin hashtags (#3700)
* Allow marks after NFC normalization. Includes regression test for the Tamil example from #3618. * Disallow just numbers + marks + underscore as hashtag.
This commit is contained in:
parent
ab758cc233
commit
b9e0689359
|
@ -177,7 +177,7 @@ func (p *hashtagParser) Parse(
|
||||||
// Ignore initial '#'.
|
// Ignore initial '#'.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
case !isPlausiblyInHashtag(r) &&
|
case !isPermittedInHashtag(r) &&
|
||||||
!isHashtagBoundary(r):
|
!isHashtagBoundary(r):
|
||||||
// Weird non-boundary character
|
// Weird non-boundary character
|
||||||
// in the hashtag. Don't trust it.
|
// in the hashtag. Don't trust it.
|
||||||
|
|
|
@ -50,6 +50,8 @@
|
||||||
withInlineCode2Expected = "<p><code>Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?</code></p>"
|
withInlineCode2Expected = "<p><code>Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?</code></p>"
|
||||||
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
|
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
|
||||||
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
|
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
|
||||||
|
withTamilHashtag = "here's a simple status that uses a hashtag in Tamil #தமிழ்"
|
||||||
|
withTamilHashtagExpected = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>"
|
||||||
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
|
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
|
||||||
mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
|
mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
|
||||||
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
|
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
|
||||||
|
@ -121,6 +123,12 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() {
|
||||||
suite.Equal(withHashtagExpected, formatted.HTML)
|
suite.Equal(withHashtagExpected, formatted.HTML)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Regression test for https://github.com/superseriousbusiness/gotosocial/issues/3618
|
||||||
|
func (suite *MarkdownTestSuite) TestParseWithTamilHashtag() {
|
||||||
|
formatted := suite.FromMarkdown(withTamilHashtag)
|
||||||
|
suite.Equal(withTamilHashtagExpected, formatted.HTML)
|
||||||
|
}
|
||||||
|
|
||||||
func (suite *MarkdownTestSuite) TestParseWithHTML() {
|
func (suite *MarkdownTestSuite) TestParseWithHTML() {
|
||||||
formatted := suite.FromMarkdown(mdWithHTML)
|
formatted := suite.FromMarkdown(mdWithHTML)
|
||||||
suite.Equal(mdWithHTMLExpected, formatted.HTML)
|
suite.Equal(mdWithHTMLExpected, formatted.HTML)
|
||||||
|
|
|
@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) {
|
||||||
|
|
||||||
// Validate normalized result.
|
// Validate normalized result.
|
||||||
var (
|
var (
|
||||||
notJustUnderscores = false
|
atLeastOneRequiredChar = false
|
||||||
onlyPermittedChars = true
|
onlyPermittedChars = true
|
||||||
lengthOK = true
|
lengthOK = true
|
||||||
)
|
)
|
||||||
|
|
||||||
for i, r := range normalized {
|
for i, r := range normalized {
|
||||||
if r != '_' {
|
if !isPermittedIfNotEntireHashtag(r) {
|
||||||
// This isn't an underscore,
|
// This isn't an underscore, mark, etc,
|
||||||
// so the whole hashtag isn't
|
// so the hashtag contains at least one required character.
|
||||||
// just underscores.
|
atLeastOneRequiredChar = true
|
||||||
notJustUnderscores = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if i >= maximumHashtagLength {
|
if i >= maximumHashtagLength {
|
||||||
|
@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores)
|
return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar
|
||||||
}
|
}
|
||||||
|
|
|
@ -118,20 +118,20 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() {
|
||||||
`
|
`
|
||||||
|
|
||||||
tags := suite.FromPlain(statusText).Tags
|
tags := suite.FromPlain(statusText).Tags
|
||||||
suite.Len(tags, 13)
|
if suite.Len(tags, 12) {
|
||||||
suite.Equal("testing123", tags[0].Name)
|
suite.Equal("testing123", tags[0].Name)
|
||||||
suite.Equal("also", tags[1].Name)
|
suite.Equal("also", tags[1].Name)
|
||||||
suite.Equal("thisshouldwork", tags[2].Name)
|
suite.Equal("thisshouldwork", tags[2].Name)
|
||||||
suite.Equal("dupe", tags[3].Name)
|
suite.Equal("dupe", tags[3].Name)
|
||||||
suite.Equal("ThisShouldAlsoWork", tags[4].Name)
|
suite.Equal("ThisShouldAlsoWork", tags[4].Name)
|
||||||
suite.Equal("this_should_not_be_split", tags[5].Name)
|
suite.Equal("this_should_not_be_split", tags[5].Name)
|
||||||
suite.Equal("111111", tags[6].Name)
|
suite.Equal("alimentación", tags[6].Name)
|
||||||
suite.Equal("alimentación", tags[7].Name)
|
suite.Equal("saúde", tags[7].Name)
|
||||||
suite.Equal("saúde", tags[8].Name)
|
suite.Equal("lävistää", tags[8].Name)
|
||||||
suite.Equal("lävistää", tags[9].Name)
|
suite.Equal("ö", tags[9].Name)
|
||||||
suite.Equal("ö", tags[10].Name)
|
suite.Equal("네", tags[10].Name)
|
||||||
suite.Equal("네", tags[11].Name)
|
suite.Equal("ThisOneIsThirteyCharactersLong", tags[11].Name)
|
||||||
suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name)
|
}
|
||||||
|
|
||||||
statusText = `#올빼미 hej`
|
statusText = `#올빼미 hej`
|
||||||
tags = suite.FromPlain(statusText).Tags
|
tags = suite.FromPlain(statusText).Tags
|
||||||
|
@ -170,8 +170,17 @@ func (suite *PlainTestSuite) TestDeriveMultiple() {
|
||||||
func (suite *PlainTestSuite) TestZalgoHashtag() {
|
func (suite *PlainTestSuite) TestZalgoHashtag() {
|
||||||
statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`
|
statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`
|
||||||
f := suite.FromPlain(statusText)
|
f := suite.FromPlain(statusText)
|
||||||
suite.Len(f.Tags, 1)
|
if suite.Len(f.Tags, 2) {
|
||||||
suite.Equal("praying", f.Tags[0].Name)
|
suite.Equal("praying", f.Tags[0].Name)
|
||||||
|
// NFC doesn't do much for Zalgo text, but it's difficult to strip marks without affecting non-Latin text.
|
||||||
|
suite.Equal("z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪", f.Tags[1].Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
|
||||||
|
statusText := `yo who else thinks #19_98 is #1?`
|
||||||
|
f := suite.FromPlain(statusText)
|
||||||
|
suite.Len(f.Tags, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPlainTestSuite(t *testing.T) {
|
func TestPlainTestSuite(t *testing.T) {
|
||||||
|
|
|
@ -19,19 +19,14 @@
|
||||||
|
|
||||||
import "unicode"
|
import "unicode"
|
||||||
|
|
||||||
func isPlausiblyInHashtag(r rune) bool {
|
func isPermittedInHashtag(r rune) bool {
|
||||||
// Marks are allowed during parsing
|
return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)
|
||||||
// prior to normalization, but not after,
|
|
||||||
// since they may be combined into letters
|
|
||||||
// during normalization.
|
|
||||||
return unicode.IsMark(r) ||
|
|
||||||
isPermittedInHashtag(r)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isPermittedInHashtag(r rune) bool {
|
// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag
|
||||||
return unicode.IsLetter(r) ||
|
// but are not allowed to be the only characters making up the hashtag.
|
||||||
unicode.IsNumber(r) ||
|
func isPermittedIfNotEntireHashtag(r rune) bool {
|
||||||
r == '_'
|
return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_'
|
||||||
}
|
}
|
||||||
|
|
||||||
// isHashtagBoundary returns true if rune r
|
// isHashtagBoundary returns true if rune r
|
||||||
|
|
Loading…
Reference in a new issue