improve span highlighting (#4480)

At first I thought simply changing the regex might help, but then I
found more and more differences between Mastodon and Tusky, so I decided
to reimplement the thing. I added 74 testcases that I all compared to
Mastodon to make sure they are correct.

On an Fairphone 4 the new implementation is faster, on an Samsung Galaxy
Tab S3 slower.

Testcases for the benchmark:
```
test of a status with #one hashtag http
```
```
test
http:// #hashtag https://connyduck.at/
http://example.org
this is a #test
and this is a @mention@test.com @test @test@test456@test.com
```
```
@mention@test.social Just your ordinary mention with a hashtag
#test
```
```
@mention@test.social Just your ordinary mention with a url
https://riot.im/app/#/room/#Tusky:matrix.org
```



FP4:
```
       11.159   ns          15 allocs    Benchmark.new_1
      119.701   ns          43 allocs    Benchmark.new_2
       21.895   ns          24 allocs    Benchmark.new_3
       87.512   ns          32 allocs    Benchmark.new_4

       16.592   ns          46 allocs    Benchmark.old_1
      134.381   ns         169 allocs    Benchmark.old_2
       28.355   ns          68 allocs    Benchmark.old_3
       45.221   ns          77 allocs    Benchmark.old_4
```

SGT3:
```
       43,785   ns          18 allocs    Benchmark.new_1
      446,074   ns          43 allocs    Benchmark.new_2
       78,802   ns          26 allocs    Benchmark.new_3
      315,478   ns          32 allocs    Benchmark.new_4

       42,186   ns          45 allocs    Benchmark.old_1
      353,570   ns         157 allocs    Benchmark.old_2
       72,376   ns          66 allocs    Benchmark.old_3
      122,985   ns          74 allocs    Benchmark.old_4
```


benchmark code is here: https://github.com/tuskyapp/tusky-span-benchmark

closes https://github.com/tuskyapp/Tusky/issues/4425
This commit is contained in:
Konrad Pozniak 2024-06-02 16:32:58 +02:00 committed by GitHub
commit 8aaca3bb2c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 2195 additions and 327 deletions

View file

@ -1,182 +1,187 @@
package com.keylesspalace.tusky
import android.text.Spannable
import com.keylesspalace.tusky.util.FoundMatchType
import com.keylesspalace.tusky.util.MENTION_PATTERN_STRING
import com.keylesspalace.tusky.util.PatternFinder
import com.keylesspalace.tusky.util.TAG_PATTERN_STRING
import com.keylesspalace.tusky.util.highlightSpans
import org.junit.Assert
import com.keylesspalace.tusky.util.twittertext.Regex
import java.util.regex.Pattern
import org.junit.Assert.assertEquals
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
class SpanUtilsTest {
@Test
fun matchesMixedSpans() {
val input = "one #one two: @two three : https://thr.ee/meh?foo=bar&wat=@at#hmm four #four five @five ろく#six"
val inputSpannable = FakeSpannable(input)
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(6, spans.size)
/** The [Pattern.UNICODE_CHARACTER_CLASS] flag is not supported on Android, on Android it is just always on.
* Since thesse tests run on a regular Jvm, we need a to set this flag or they would behave differently.
* */
private val urlPattern = Regex.VALID_URL_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE or Pattern.UNICODE_CHARACTER_CLASS)
private val tagPattern = TAG_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE or Pattern.UNICODE_CHARACTER_CLASS)
private val mentionPattern = MENTION_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE or Pattern.UNICODE_CHARACTER_CLASS)
val finders = listOf(
PatternFinder("http", FoundMatchType.HTTPS_URL, urlPattern),
PatternFinder("#", FoundMatchType.TAG, tagPattern),
PatternFinder("@", FoundMatchType.MENTION, mentionPattern)
)
@RunWith(Parameterized::class)
class SpanUtilsTest(
private val stringToHighlight: String,
private val highlights: List<Pair<Int, Int>>
) {
companion object {
@Parameterized.Parameters(name = "{0}")
@JvmStatic
fun data() = listOf(
arrayOf("@mention", listOf(0 to 8)),
arrayOf("@mention@server.com", listOf(0 to 19)),
arrayOf("#tag", listOf(0 to 4)),
arrayOf("#tåg", listOf(0 to 4)),
arrayOf("https://thr.ee/meh?foo=bar&wat=@at#hmm", listOf(0 to 38)),
arrayOf("http://thr.ee/meh?foo=bar&wat=@at#hmm", listOf(0 to 37)),
arrayOf(
"one #one two: @two three : https://thr.ee/meh?foo=bar&wat=@at#hmm four #four five @five 6 #six",
listOf(4 to 8, 14 to 18, 27 to 65, 71 to 76, 82 to 87, 90 to 94)
),
arrayOf("http://first.link https://second.link", listOf(0 to 17, 18 to 37)),
arrayOf("#test", listOf(0 to 5)),
arrayOf(" #AfterSpace", listOf(1 to 12)),
arrayOf("#BeforeSpace ", listOf(0 to 12)),
arrayOf("@#after_at", listOf(1 to 10)),
arrayOf("あいうえお#after_hiragana", listOf<Pair<Int, Int>>()),
arrayOf("##DoubleHash", listOf(1 to 12)),
arrayOf("###TripleHash", listOf(2 to 13)),
arrayOf("something#notAHashtag", listOf<Pair<Int, Int>>()),
arrayOf("test##maybeAHashtag", listOf(5 to 19)),
arrayOf("testhttp://not.a.url.com", listOf<Pair<Int, Int>>()),
arrayOf("test@notAMention", listOf<Pair<Int, Int>>()),
arrayOf("test@notAMention#notAHashtag", listOf<Pair<Int, Int>>()),
arrayOf("test@notAMention@server.com", listOf<Pair<Int, Int>>()),
// Mastodon will not highlight this mention, although it would be valid according to their regex
// arrayOf("@test@notAMention@server.com", listOf<Pair<Int, Int>>()),
arrayOf("testhttps://not.a.url.com", listOf<Pair<Int, Int>>()),
arrayOf("#hashtag1", listOf(0 to 9)),
arrayOf("#1hashtag", listOf(0 to 9)),
arrayOf("#サイクリング", listOf(0 to 7)),
arrayOf("#自転車に乗る", listOf(0 to 7)),
arrayOf("(#test)", listOf(1 to 6)),
arrayOf(")#test(", listOf<Pair<Int, Int>>()),
arrayOf("{#test}", listOf(1 to 6)),
arrayOf("[#test]", listOf(1 to 6)),
arrayOf("}#test{", listOf(1 to 6)),
arrayOf("]#test[", listOf(1 to 6)),
arrayOf("<#test>", listOf(1 to 6)),
arrayOf(">#test<", listOf(1 to 6)),
arrayOf("((#Test))", listOf(2 to 7)),
arrayOf("((##Te)st)", listOf(3 to 6)),
arrayOf("[@ConnyDuck]", listOf(1 to 11)),
arrayOf("(@ConnyDuck)", listOf(1 to 11)),
arrayOf("(@ConnyDuck@chaos.social)", listOf(1 to 24)),
arrayOf("Test(https://test.xyz/blubb(test)))))))))))", listOf(5 to 33)),
arrayOf("Test https://test.xyz/blubb(test)))))))))))", listOf(5 to 33)),
arrayOf("Test https://test.xyz/blubbtest)))))))))))", listOf(5 to 31)),
arrayOf("#https://test.com", listOf(0 to 6)),
arrayOf("#https://t", listOf(0 to 6)),
arrayOf("(https://blubb.com", listOf(1 to 18)),
arrayOf("https://example.com/path#anchor", listOf(0 to 31)),
arrayOf("test httpx2345://wrong.protocol.com", listOf<Pair<Int, Int>>()),
arrayOf("test https://nonexistent.topleveldomain.testtest", listOf<Pair<Int, Int>>()),
arrayOf("test https://example.com:1234 domain with port", listOf(5 to 29)),
arrayOf("http://1.1.1.1", listOf<Pair<Int, Int>>()),
arrayOf("http://foo.bar/?q=Test%20URL-encoded%20stuff", listOf(0 to 44)),
arrayOf("http://userid:password@example.com", listOf<Pair<Int, Int>>()),
arrayOf("http://userid@example.com", listOf<Pair<Int, Int>>()),
arrayOf("http://foo.com/blah_blah_(brackets)_(again)", listOf(0 to 43)),
arrayOf("test example.com/no/protocol", listOf<Pair<Int, Int>>()),
arrayOf("protocol only https://", listOf<Pair<Int, Int>>()),
arrayOf("no tld https://test", listOf<Pair<Int, Int>>()),
arrayOf("mention in url https://test.com/@test@domain.cat", listOf(15 to 48)),
arrayOf("#hash_tag", listOf(0 to 9)),
arrayOf("#hashtag_", listOf(0 to 9)),
arrayOf("#hashtag_#tag", listOf(0 to 9, 9 to 13)),
arrayOf("#hash_tag#tag", listOf(0 to 9)),
arrayOf("_#hashtag", listOf(1 to 9)),
arrayOf("@@ConnyDuck@chaos.social", listOf(1 to 24)),
arrayOf("http://https://connyduck.at", listOf(7 to 27)),
arrayOf("https://https://connyduck.at", listOf(8 to 28)),
arrayOf("http:// http://connyduck.at", listOf(8 to 27)),
arrayOf("https:// https://connyduck.at", listOf(9 to 29)),
arrayOf("https:// #test https://connyduck.at", listOf(9 to 14, 15 to 35)),
arrayOf("http:// @connyduck http://connyduck.at", listOf(8 to 18, 19 to 38)),
// emojis count as multiple characters
arrayOf("😜https://connyduck.at", listOf(2 to 22)),
arrayOf("😜#tag", listOf(2 to 6)),
arrayOf("😜@user@mastodon.example", listOf(2 to 24)),
)
}
@Test
fun doesntMergeAdjacentURLs() {
val firstURL = "http://first.thing"
val secondURL = "https://second.thing"
val inputSpannable = FakeSpannable("$firstURL $secondURL")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(2, spans.size)
Assert.assertEquals(firstURL.length, spans[0].end - spans[0].start)
Assert.assertEquals(secondURL.length, spans[1].end - spans[1].start)
}
fun testHighlighting() {
val inputSpannable = FakeSpannable(stringToHighlight)
inputSpannable.highlightSpans(0xffffff, finders)
@RunWith(Parameterized::class)
class MatchingTests(private val thingToHighlight: String) {
companion object {
@Parameterized.Parameters(name = "{0}")
@JvmStatic
fun data(): Iterable<Any> {
return listOf(
"@mention",
"#tag",
"#tåg",
"https://thr.ee/meh?foo=bar&wat=@at#hmm",
"http://thr.ee/meh?foo=bar&wat=@at#hmm"
)
assertEquals(highlights.size, inputSpannable.spans.size)
inputSpannable.spans
.sortedBy { span -> span.start }
.forEachIndexed { index, span ->
assertEquals(highlights[index].first, span.start)
assertEquals(highlights[index].second, span.end)
}
}
@Test
fun matchesSpanAtStart() {
val inputSpannable = FakeSpannable(thingToHighlight)
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(1, spans.size)
Assert.assertEquals(thingToHighlight.length, spans[0].end - spans[0].start)
}
@Test
fun matchesSpanNotAtStart() {
val inputSpannable = FakeSpannable(" $thingToHighlight")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(1, spans.size)
Assert.assertEquals(thingToHighlight.length, spans[0].end - spans[0].start)
}
@Test
fun doesNotMatchSpanEmbeddedInText() {
val inputSpannable = FakeSpannable("aa${thingToHighlight}aa")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertTrue(spans.isEmpty())
}
@Test
fun doesNotMatchSpanEmbeddedInAnotherSpan() {
val inputSpannable = FakeSpannable("@aa${thingToHighlight}aa")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(1, spans.size)
}
@Test
fun spansDoNotOverlap() {
val begin = "@begin"
val end = "#end"
val inputSpannable = FakeSpannable("$begin $thingToHighlight $end")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(3, spans.size)
val middleSpan = spans.single { span -> span.start > 0 && span.end < inputSpannable.lastIndex }
Assert.assertEquals(begin.length + 1, middleSpan.start)
Assert.assertEquals(inputSpannable.length - end.length - 1, middleSpan.end)
}
}
@RunWith(Parameterized::class)
class HighlightingTestsForTag(
private val text: String,
private val expectedStartIndex: Int,
private val expectedEndIndex: Int
) {
companion object {
@Parameterized.Parameters(name = "{0}")
@JvmStatic
fun data(): Iterable<Any> {
return listOf(
arrayOf("#test", 0, 5),
arrayOf(" #AfterSpace", 1, 12),
arrayOf("#BeforeSpace ", 0, 12),
arrayOf("@#after_at", 1, 10),
arrayOf("あいうえお#after_hiragana", 5, 20),
arrayOf("##DoubleHash", 1, 12),
arrayOf("###TripleHash", 2, 13)
)
}
}
@Test
fun matchExpectations() {
val inputSpannable = FakeSpannable(text)
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(1, spans.size)
val span = spans.first()
Assert.assertEquals(expectedStartIndex, span.start)
Assert.assertEquals(expectedEndIndex, span.end)
}
}
class FakeSpannable(private val text: String) : Spannable {
val spans = mutableListOf<BoundedSpan>()
override fun setSpan(what: Any?, start: Int, end: Int, flags: Int) {
spans.add(BoundedSpan(what, start, end))
}
@Suppress("UNCHECKED_CAST")
override fun <T : Any> getSpans(start: Int, end: Int, type: Class<T>): Array<T> {
return spans.filter { it.start >= start && it.end <= end && type.isInstance(it.span) }
.map { it.span }
.toTypedArray() as Array<T>
}
override fun removeSpan(what: Any?) {
spans.removeIf { span -> span.span == what }
}
override fun toString(): String {
return text
}
override val length: Int
get() = text.length
class BoundedSpan(val span: Any?, val start: Int, val end: Int)
override fun nextSpanTransition(start: Int, limit: Int, type: Class<*>?): Int {
throw NotImplementedError()
}
override fun getSpanEnd(tag: Any?): Int {
throw NotImplementedError()
}
override fun getSpanFlags(tag: Any?): Int {
throw NotImplementedError()
}
override fun get(index: Int): Char {
throw NotImplementedError()
}
override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
throw NotImplementedError()
}
override fun getSpanStart(tag: Any?): Int {
throw NotImplementedError()
}
}
}
class FakeSpannable(private val text: String) : Spannable {
val spans = mutableListOf<BoundedSpan>()
override fun setSpan(what: Any?, start: Int, end: Int, flags: Int) {
spans.add(BoundedSpan(what, start, end))
}
@Suppress("UNCHECKED_CAST")
override fun <T : Any> getSpans(start: Int, end: Int, type: Class<T>): Array<T> {
return spans.filter { it.start >= start && it.end <= end && type.isInstance(it.span) }
.map { it.span }
.toTypedArray() as Array<T>
}
override fun removeSpan(what: Any?) {
spans.removeIf { span -> span.span == what }
}
override fun toString(): String {
return text
}
override val length: Int
get() = text.length
class BoundedSpan(val span: Any?, val start: Int, val end: Int)
override fun nextSpanTransition(start: Int, limit: Int, type: Class<*>?): Int {
throw NotImplementedError()
}
override fun getSpanEnd(tag: Any?): Int {
throw NotImplementedError()
}
override fun getSpanFlags(tag: Any?): Int {
throw NotImplementedError()
}
override fun get(index: Int): Char {
return text[index]
}
override fun subSequence(startIndex: Int, endIndex: Int): CharSequence {
return text.subSequence(startIndex, endIndex)
}
override fun getSpanStart(tag: Any?): Int {
throw NotImplementedError()
}
}