improve span highlighting (#4480)
At first I thought simply changing the regex might help, but then I found more and more differences between Mastodon and Tusky, so I decided to reimplement the thing. I added 74 testcases that I all compared to Mastodon to make sure they are correct. On an Fairphone 4 the new implementation is faster, on an Samsung Galaxy Tab S3 slower. Testcases for the benchmark: ``` test of a status with #one hashtag http ``` ``` test http:// #hashtag https://connyduck.at/ http://example.org this is a #test and this is a @mention@test.com @test @test@test456@test.com ``` ``` @mention@test.social Just your ordinary mention with a hashtag #test ``` ``` @mention@test.social Just your ordinary mention with a url https://riot.im/app/#/room/#Tusky:matrix.org ``` FP4: ``` 11.159 ns 15 allocs Benchmark.new_1 119.701 ns 43 allocs Benchmark.new_2 21.895 ns 24 allocs Benchmark.new_3 87.512 ns 32 allocs Benchmark.new_4 16.592 ns 46 allocs Benchmark.old_1 134.381 ns 169 allocs Benchmark.old_2 28.355 ns 68 allocs Benchmark.old_3 45.221 ns 77 allocs Benchmark.old_4 ``` SGT3: ``` 43,785 ns 18 allocs Benchmark.new_1 446,074 ns 43 allocs Benchmark.new_2 78,802 ns 26 allocs Benchmark.new_3 315,478 ns 32 allocs Benchmark.new_4 42,186 ns 45 allocs Benchmark.old_1 353,570 ns 157 allocs Benchmark.old_2 72,376 ns 66 allocs Benchmark.old_3 122,985 ns 74 allocs Benchmark.old_4 ``` benchmark code is here: https://github.com/tuskyapp/tusky-span-benchmark closes https://github.com/tuskyapp/Tusky/issues/4425
This commit is contained in:
parent
e05ded2e32
commit
8aaca3bb2c
7 changed files with 2195 additions and 327 deletions
|
|
@ -9,105 +9,99 @@ import android.text.style.DynamicDrawableSpan
|
|||
import android.text.style.ForegroundColorSpan
|
||||
import android.text.style.ImageSpan
|
||||
import android.text.style.URLSpan
|
||||
import com.keylesspalace.tusky.util.twittertext.Regex
|
||||
import com.mikepenz.iconics.IconicsDrawable
|
||||
import com.mikepenz.iconics.typeface.library.googlematerial.GoogleMaterial
|
||||
import java.util.regex.Pattern
|
||||
import kotlin.math.max
|
||||
|
||||
/**
|
||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
|
||||
* Tag#HASHTAG_RE</a>.
|
||||
*/
|
||||
private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c"
|
||||
private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions )
|
||||
private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))"
|
||||
private const val HASHTAG_SEPARATORS = "_\\u00B7\\u30FB\\u200c"
|
||||
internal const val TAG_PATTERN_STRING = "(?<![=/)\\p{Alnum}])(#(([\\w_][\\w$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][\\w$HASHTAG_SEPARATORS]*[\\w_])|([\\w_]*[\\p{Alpha}][\\w_]*)))"
|
||||
private val TAG_PATTERN = TAG_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE)
|
||||
|
||||
/**
|
||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
|
||||
* Account#MENTION_RE</a>
|
||||
*/
|
||||
private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?"
|
||||
private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)"
|
||||
private const val USERNAME_PATTERN_STRING = "[a-z0-9_]+([a-z0-9_.-]+[a-z0-9_]+)?"
|
||||
internal const val MENTION_PATTERN_STRING = "(?<![=/\\w])(@($USERNAME_PATTERN_STRING)(?:@[\\w.-]+[\\w]+)?)"
|
||||
private val MENTION_PATTERN = MENTION_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE)
|
||||
|
||||
private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
|
||||
private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"
|
||||
|
||||
/**
|
||||
* Dump of android.util.Patterns.WEB_URL
|
||||
*/
|
||||
private val STRICT_WEB_URL_PATTERN = Pattern.compile(
|
||||
"(((?:(?i:http|https|rtsp)://(?:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?(?:(([a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]](?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]_\\-]{0,61}[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]]){0,1}\\.)+(xn\\-\\-[\\w\\-]{0,58}\\w|[a-zA-Z[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]]{2,63})|((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9]))))(?:\\:\\d{1,5})?)([/\\?](?:(?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]];/\\?:@&=#~\\-\\.\\+!\\*'\\(\\),_\\\$])|(?:%[a-fA-F0-9]{2}))*)?(?:\\b|\$|^))"
|
||||
)
|
||||
private val VALID_URL_PATTERN = Regex.VALID_URL_PATTERN_STRING.toPattern(Pattern.CASE_INSENSITIVE)
|
||||
|
||||
private val spanClasses = listOf(ForegroundColorSpan::class.java, URLSpan::class.java)
|
||||
private val finders = mapOf(
|
||||
FoundMatchType.HTTP_URL to PatternFinder(':', HTTP_URL_REGEX, 5, Character::isWhitespace),
|
||||
FoundMatchType.HTTPS_URL to PatternFinder(':', HTTPS_URL_REGEX, 6, Character::isWhitespace),
|
||||
FoundMatchType.TAG to PatternFinder('#', TAG_REGEX, 1, ::isValidForTagPrefix),
|
||||
// TODO: We also need a proper validator for mentions
|
||||
FoundMatchType.MENTION to PatternFinder('@', MENTION_REGEX, 1, Character::isWhitespace)
|
||||
|
||||
// url must come first, it may contain the other patterns
|
||||
val defaultFinders = listOf(
|
||||
PatternFinder("http", FoundMatchType.HTTP_URL, VALID_URL_PATTERN),
|
||||
PatternFinder("#", FoundMatchType.TAG, TAG_PATTERN),
|
||||
PatternFinder("@", FoundMatchType.MENTION, MENTION_PATTERN)
|
||||
)
|
||||
|
||||
private enum class FoundMatchType {
|
||||
enum class FoundMatchType {
|
||||
HTTP_URL,
|
||||
HTTPS_URL,
|
||||
TAG,
|
||||
MENTION
|
||||
}
|
||||
|
||||
private class FindCharsResult {
|
||||
lateinit var matchType: FoundMatchType
|
||||
var start: Int = -1
|
||||
var end: Int = -1
|
||||
}
|
||||
|
||||
private class PatternFinder(
|
||||
val searchCharacter: Char,
|
||||
regex: String,
|
||||
val searchPrefixWidth: Int,
|
||||
val prefixValidator: (Int) -> Boolean
|
||||
) {
|
||||
val pattern: Pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||
}
|
||||
class PatternFinder(
|
||||
val searchString: String,
|
||||
val type: FoundMatchType,
|
||||
val pattern: Pattern
|
||||
)
|
||||
|
||||
/**
|
||||
* Takes text containing mentions and hashtags and urls and makes them the given colour.
|
||||
* @param finders The finders to use. This is here so they can be overridden from unit tests.
|
||||
*/
|
||||
fun highlightSpans(text: Spannable, colour: Int) {
|
||||
fun Spannable.highlightSpans(colour: Int, finders: List<PatternFinder> = defaultFinders) {
|
||||
// Strip all existing colour spans.
|
||||
for (spanClass in spanClasses) {
|
||||
clearSpans(text, spanClass)
|
||||
clearSpans(spanClass)
|
||||
}
|
||||
|
||||
// Colour the mentions and hashtags.
|
||||
val string = text.toString()
|
||||
val length = text.length
|
||||
var start = 0
|
||||
var end = 0
|
||||
while (end in 0 until length && start >= 0) {
|
||||
// Search for url first because it can contain the other characters
|
||||
val found = findPattern(string, end)
|
||||
start = found.start
|
||||
end = found.end
|
||||
if (start in 0 until end) {
|
||||
text.setSpan(
|
||||
getSpan(found.matchType, string, colour, start, end),
|
||||
start,
|
||||
end,
|
||||
Spanned.SPAN_INCLUSIVE_EXCLUSIVE
|
||||
)
|
||||
start += finders[found.matchType]!!.searchPrefixWidth
|
||||
for (finder in finders) {
|
||||
// before running the regular expression, check if there is even a chance of it finding something
|
||||
if (this.contains(finder.searchString)) {
|
||||
val matcher = finder.pattern.matcher(this)
|
||||
|
||||
while (matcher.find()) {
|
||||
// we found a match
|
||||
val start = matcher.start(1)
|
||||
|
||||
val end = matcher.end(1)
|
||||
|
||||
// only add a span if there is no other one yet (e.g. the #anchor part of an url might match as hashtag, but must be ignored)
|
||||
if (this.getSpans(start, end, URLSpan::class.java).isEmpty()) {
|
||||
this.setSpan(
|
||||
getSpan(finder.type, this, colour, start, end),
|
||||
start,
|
||||
end,
|
||||
Spanned.SPAN_INCLUSIVE_EXCLUSIVE
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun <T> Spannable.clearSpans(spanClass: Class<T>) {
|
||||
for (span in getSpans(0, length, spanClass)) {
|
||||
removeSpan(span)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces text of the form [iconics name] with their spanned counterparts (ImageSpan).
|
||||
*/
|
||||
fun addDrawables(text: CharSequence, color: Int, size: Int, context: Context): Spannable {
|
||||
val builder = SpannableStringBuilder(text)
|
||||
|
||||
val pattern = Pattern.compile("\\[iconics ([0-9a-z_]+)\\]")
|
||||
val pattern = Pattern.compile("\\[iconics ([0-9a-z_]+)]")
|
||||
val matcher = pattern.matcher(builder)
|
||||
while (matcher.find()) {
|
||||
val resourceName = matcher.group(1)
|
||||
|
|
@ -123,98 +117,16 @@ fun addDrawables(text: CharSequence, color: Int, size: Int, context: Context): S
|
|||
return builder
|
||||
}
|
||||
|
||||
private fun <T> clearSpans(text: Spannable, spanClass: Class<T>) {
|
||||
for (span in text.getSpans(0, text.length, spanClass)) {
|
||||
text.removeSpan(span)
|
||||
}
|
||||
}
|
||||
|
||||
private fun findPattern(string: String, fromIndex: Int): FindCharsResult {
|
||||
val result = FindCharsResult()
|
||||
for (i in fromIndex..string.lastIndex) {
|
||||
val c = string[i]
|
||||
for (matchType in FoundMatchType.entries) {
|
||||
val finder = finders[matchType]
|
||||
if (finder!!.searchCharacter == c &&
|
||||
(
|
||||
(i - fromIndex) < finder.searchPrefixWidth ||
|
||||
finder.prefixValidator(string.codePointAt(i - finder.searchPrefixWidth))
|
||||
)
|
||||
) {
|
||||
result.matchType = matchType
|
||||
result.start = max(0, i - finder.searchPrefixWidth)
|
||||
findEndOfPattern(string, result, finder.pattern)
|
||||
if (result.start + finder.searchPrefixWidth <= i + 1 && // The found result is actually triggered by the correct search character
|
||||
result.end >= result.start
|
||||
) { // ...and we actually found a valid result
|
||||
return result
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
private fun findEndOfPattern(string: String, result: FindCharsResult, pattern: Pattern) {
|
||||
val matcher = pattern.matcher(string)
|
||||
if (matcher.find(result.start)) {
|
||||
// Once we have API level 26+, we can use named captures...
|
||||
val end = matcher.end()
|
||||
result.start = matcher.start()
|
||||
when (result.matchType) {
|
||||
FoundMatchType.TAG -> {
|
||||
if (isValidForTagPrefix(string.codePointAt(result.start))) {
|
||||
if (string[result.start] != '#' ||
|
||||
(string[result.start] == '#' && string[result.start + 1] == '#')
|
||||
) {
|
||||
++result.start
|
||||
}
|
||||
}
|
||||
}
|
||||
else -> {
|
||||
if (Character.isWhitespace(string.codePointAt(result.start))) {
|
||||
++result.start
|
||||
}
|
||||
}
|
||||
}
|
||||
when (result.matchType) {
|
||||
FoundMatchType.HTTP_URL, FoundMatchType.HTTPS_URL -> {
|
||||
// Preliminary url patterns are fast/permissive, now we'll do full validation
|
||||
if (STRICT_WEB_URL_PATTERN.matcher(string.substring(result.start, end)).matches()) {
|
||||
result.end = end
|
||||
}
|
||||
}
|
||||
else -> result.end = end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun getSpan(
|
||||
matchType: FoundMatchType,
|
||||
string: String,
|
||||
string: CharSequence,
|
||||
colour: Int,
|
||||
start: Int,
|
||||
end: Int
|
||||
): CharacterStyle {
|
||||
return when (matchType) {
|
||||
FoundMatchType.HTTP_URL -> NoUnderlineURLSpan(string.substring(start, end))
|
||||
FoundMatchType.HTTPS_URL -> NoUnderlineURLSpan(string.substring(start, end))
|
||||
FoundMatchType.HTTP_URL, FoundMatchType.HTTPS_URL -> NoUnderlineURLSpan(string.substring(start, end))
|
||||
FoundMatchType.MENTION -> MentionSpan(string.substring(start, end))
|
||||
else -> ForegroundColorSpan(colour)
|
||||
}
|
||||
}
|
||||
|
||||
private fun isWordCharacters(codePoint: Int): Boolean {
|
||||
return (codePoint in 0x30..0x39) || // [0-9]
|
||||
(codePoint in 0x41..0x5a) || // [A-Z]
|
||||
(codePoint == 0x5f) || // _
|
||||
(codePoint in 0x61..0x7a) // [a-z]
|
||||
}
|
||||
|
||||
private fun isValidForTagPrefix(codePoint: Int): Boolean {
|
||||
return !(
|
||||
isWordCharacters(codePoint) || // \w
|
||||
(codePoint == 0x2f) || // /
|
||||
(codePoint == 0x29)
|
||||
) // )
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue