Fix the tag span generation for tags with nonascii characters (#2700)

* Update mention and tag regexes from mastodon * Normalize nonascii tag names the same way that mastodon does
2022-09-17 19:06:45 +02:00 · 2022-09-17 19:06:45 +02:00 · 5d09a67b52
commit 5d09a67b52
parent 687cffd540
5 changed files with 44 additions and 3 deletions
--- a/app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt
+++ b/app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt
@ -0,0 +1,26 @@
+/* Copyright 2022 Tusky contributors
+ *
+ * This file is a part of Tusky.
+ *
+ * This program is free software; you can redistribute it and/or modify it under the terms of the
+ * GNU General Public License as published by the Free Software Foundation; either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * Tusky is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
+ * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with Tusky; if not,
+ * see <http://www.gnu.org/licenses>. */
+
+package com.keylesspalace.tusky.util
+
+// Inspired by https://github.com/mastodon/mastodon/blob/main/app/lib/ascii_folding.rb
+
+val unicodeToASCIIMap = "ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž".toList().zip(
+    "AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz".toList()
+).toMap()
+
+fun normalizeToASCII(text: CharSequence): CharSequence {
+    return String(text.map { unicodeToASCIIMap[it] ?: it }.toCharArray())
+}
--- a/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt
+++ b/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt
@ -124,7 +124,7 @@ fun setClickableText(

@VisibleForTesting
 fun getTagName(text: CharSequence, tags: List<HashTag>?): String? {
-    val scrapedName = text.subSequence(1, text.length).toString()
+    val scrapedName = normalizeToASCII(text.subSequence(1, text.length)).toString()
    return when (tags) {
        null -> scrapedName
        else -> tags.firstOrNull { it.name.equals(scrapedName, true) }?.name
--- a/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt
+++ b/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt
@ -12,13 +12,16 @@ import kotlin.math.max
 * @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
 *     Tag#HASHTAG_RE</a>.
 */
-private const val TAG_REGEX = "(?:^|[^/)A-Za-z0-9_])#([\\w_]*[\\p{Alpha}_][\\w_]*)"
+private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c"
+private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions )
+private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))"

 /**
 * @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
 *     Account#MENTION_RE</a>
 */
-private const val MENTION_REGEX = "(?:^|[^/[:word:]])@([a-z0-9_-]+(?:@[a-z0-9\\.\\-]+[a-z0-9]+)?)"
+private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?"
+private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)"

 private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
 private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"