From ab27f0aa2e9a311d22d96af535aa29e9842fc768 Mon Sep 17 00:00:00 2001 From: Adriaan de Groot Date: Tue, 26 Jul 2022 22:10:46 +0200 Subject: [PATCH 1/2] [locale] Repair locale-matching (cherry-pick from *calamares*) - add struct that splits a locale name into parts - be more chatty during matching (cherry picked from commit fd56b5bdc43f88bcaa44d728c3f157726c14b395) (cherry picked from commit 78e216fedbbcdcf4fc40c5a69e075521aa16be80) (cherry picked from commit cfb8ef9f65da630203c6ecd41b6ac13f8ac9c460) (cherry picked from commit eb242168bf4ac80aa920d795a9b0cbec632af8f1) (cherry picked from commit 40527ffd4e6585038ac9cbbe3e75853a7b6fd6e0) (cherry picked from commit 6cbf2d7e3292f4d15af1452117d4ecc6b3786caa) (cherry picked from commit a422fd80d98a5cdbd6db1d9f5405a1bd432bdff9) (cherry picked from commit 35401214499ff6ed98b1dd143883e7b7d4d84130) --- src/modules/locale/CMakeLists.txt | 15 +- src/modules/locale/LocaleConfiguration.cpp | 236 ++++++++++----------- src/modules/locale/LocaleNames.cpp | 90 ++++++++ src/modules/locale/LocaleNames.h | 46 ++++ src/modules/locale/Tests.cpp | 99 ++++++++- 5 files changed, 346 insertions(+), 140 deletions(-) create mode 100644 src/modules/locale/LocaleNames.cpp create mode 100644 src/modules/locale/LocaleNames.h diff --git a/src/modules/locale/CMakeLists.txt b/src/modules/locale/CMakeLists.txt index 6543d7d94..b631f77f7 100644 --- a/src/modules/locale/CMakeLists.txt +++ b/src/modules/locale/CMakeLists.txt @@ -22,6 +22,7 @@ calamares_add_plugin(locale Config.cpp LCLocaleDialog.cpp LocaleConfiguration.cpp + LocaleNames.cpp LocalePage.cpp LocaleViewStep.cpp SetTimezoneJob.cpp @@ -39,15 +40,7 @@ calamares_add_plugin(locale calamares_add_test( localetest - SOURCES - Tests.cpp - Config.cpp - LocaleConfiguration.cpp - SetTimezoneJob.cpp - timezonewidget/TimeZoneImage.cpp - DEFINITIONS - SOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/images" - DEBUG_TIMEZONES=1 - LIBRARIES - Qt5::Gui + SOURCES Tests.cpp Config.cpp LocaleConfiguration.cpp LocaleNames.cpp SetTimezoneJob.cpp timezonewidget/TimeZoneImage.cpp + DEFINITIONS SOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/images" DEBUG_TIMEZONES=1 + LIBRARIES Qt5::Gui ) diff --git a/src/modules/locale/LocaleConfiguration.cpp b/src/modules/locale/LocaleConfiguration.cpp index 17953f079..c62b1ab08 100644 --- a/src/modules/locale/LocaleConfiguration.cpp +++ b/src/modules/locale/LocaleConfiguration.cpp @@ -9,11 +9,13 @@ */ #include "LocaleConfiguration.h" +#include "LocaleNames.h" #include "utils/Logger.h" #include #include +#include LocaleConfiguration::LocaleConfiguration() : explicit_lang( false ) @@ -40,6 +42,106 @@ LocaleConfiguration::setLanguage( const QString& localeName ) m_lang = localeName; } +static LocaleNameParts +updateCountry( LocaleNameParts p, const QString& country ) +{ + p.country = country; + return p; +} + +static QPair< int, LocaleNameParts > +identifyBestLanguageMatch( const LocaleNameParts& referenceLocale, QVector< LocaleNameParts >& others ) +{ + std::sort( others.begin(), + others.end(), + [ & ]( const LocaleNameParts& lhs, const LocaleNameParts& rhs ) + { return referenceLocale.similarity( lhs ) < referenceLocale.similarity( rhs ); } ); + // The best match is at the end + LocaleNameParts best_match = others.last(); + if ( !( referenceLocale.similarity( best_match ) > LocaleNameParts::no_match ) ) + { + cDebug() << Logger::SubEntry << "Got no good match for" << referenceLocale.name(); + return { LocaleNameParts::no_match, LocaleNameParts {} }; + } + else + { + cDebug() << Logger::SubEntry << "Got best match for" << referenceLocale.name() << "as" << best_match.name(); + return { referenceLocale.similarity( best_match ), best_match }; + } +} + +/** @brief Returns the QString from @p availableLocales that best-matches. + */ +static LocaleNameParts +identifyBestLanguageMatch( const QString& languageLocale, + const QStringList& availableLocales, + const QString& countryCode ) +{ + const QString default_lang = QStringLiteral( "en_US.UTF-8" ); + + const LocaleNameParts self = LocaleNameParts::fromName( languageLocale ); + if ( self.isValid() && !availableLocales.isEmpty() ) + { + QVector< LocaleNameParts > others; + others.resize( availableLocales.length() ); // Makes default structs + std::transform( availableLocales.begin(), availableLocales.end(), others.begin(), LocaleNameParts::fromName ); + + // Keep track of the best match in various attempts + int best_score = LocaleNameParts::no_match; + LocaleNameParts best_match; + + // Check with the unmodified language setting + { + auto [ score, match ] = identifyBestLanguageMatch( self, others ); + if ( score >= LocaleNameParts::complete_match ) + { + return match; + } + else if ( score > best_score ) + { + best_match = match; + } + } + + + // .. but it might match **better** with the chosen location country Code + { + auto [ score, match ] = identifyBestLanguageMatch( updateCountry( self, countryCode ), others ); + if ( score >= LocaleNameParts::complete_match ) + { + return match; + } + else if ( score > best_score ) + { + best_match = match; + } + } + + // .. or better yet with the QLocale-derived country + { + const QString localeCountry = LocaleNameParts::fromName( QLocale( languageLocale ).name() ).country; + auto [ score, match ] = identifyBestLanguageMatch( updateCountry( self, localeCountry ), others ); + if ( score >= LocaleNameParts::complete_match ) + { + return match; + } + else if ( score > best_score ) + { + best_match = match; + } + } + + if ( best_match.isValid() ) + { + cDebug() << Logger::SubEntry << "Matched best with" << best_match.name(); + return best_match; + } + } + + // Else we have an unrecognized or unsupported locale, all we can do is go with + // en_US.UTF-8 UTF-8. This completes all default language setting guesswork. + return LocaleNameParts::fromName( default_lang ); +} LocaleConfiguration LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale, @@ -47,100 +149,7 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale, const QString& countryCode ) { cDebug() << "Mapping" << languageLocale << "in" << countryCode << "to locale."; - QString language = languageLocale.split( '_' ).first(); - QString region; - if ( language.contains( '@' ) ) - { - auto r = language.split( '@' ); - language = r.first(); - region = r[ 1 ]; // second() - } - - // Either an exact match, or the whole language part matches - // (followed by . or _ - QStringList linesForLanguage = availableLocales.filter( QRegularExpression( language + "[._]" ) ); - cDebug() << Logger::SubEntry << "Matching" << linesForLanguage; - - QString lang; - if ( linesForLanguage.isEmpty() || languageLocale.isEmpty() ) - { - lang = "en_US.UTF-8"; - } - else if ( linesForLanguage.length() == 1 ) - { - lang = linesForLanguage.first(); - } - - // lang could still be empty if we found multiple locales that satisfy myLanguage - const QString combinedLanguageAndCountry = QString( "%1_%2" ).arg( language ).arg( countryCode ); - if ( lang.isEmpty() && region.isEmpty() ) - { - auto l = linesForLanguage.filter( - QRegularExpression( combinedLanguageAndCountry + "[._]" ) ); // no regional variants - if ( l.length() == 1 ) - { - lang = l.first(); - } - } - - // The following block was inspired by Ubiquity, scripts/localechooser-apply. - // No copyright statement found in file, assuming GPL v2 or later. - /* # In the special cases of Portuguese and Chinese, selecting a - # different location may imply a different dialect of the language. - # In such cases, make LANG reflect the selected language (for - # messages, character types, and collation) and make the other - # locale categories reflect the selected location. */ - if ( language == "pt" || language == "zh" ) - { - cDebug() << Logger::SubEntry << "Special-case Portuguese and Chinese"; - QString proposedLocale = QString( "%1_%2" ).arg( language ).arg( countryCode ); - for ( const QString& line : linesForLanguage ) - { - if ( line.contains( proposedLocale ) ) - { - cDebug() << Logger::SubEntry << "Country-variant" << line << "chosen."; - lang = line; - break; - } - } - } - if ( lang.isEmpty() && !region.isEmpty() ) - { - cDebug() << Logger::SubEntry << "Special-case region @" << region; - QString proposedRegion = QString( "@%1" ).arg( region ); - for ( const QString& line : linesForLanguage ) - { - if ( line.startsWith( language ) && line.contains( proposedRegion ) ) - { - cDebug() << Logger::SubEntry << "Region-variant" << line << "chosen."; - lang = line; - break; - } - } - } - - - // If we found no good way to set a default lang, do a search with the whole - // language locale and pick the first result, if any. - if ( lang.isEmpty() ) - { - for ( const QString& line : availableLocales ) - { - if ( line.startsWith( languageLocale ) ) - { - lang = line; - break; - } - } - } - - // Else we have an unrecognized or unsupported locale, all we can do is go with - // en_US.UTF-8 UTF-8. This completes all default language setting guesswork. - if ( lang.isEmpty() ) - { - lang = "en_US.UTF-8"; - } - + const auto bestLocale = identifyBestLanguageMatch( languageLocale, availableLocales, countryCode ); // The following block was inspired by Ubiquity, scripts/localechooser-apply. // No copyright statement found in file, assuming GPL v2 or later. @@ -188,34 +197,16 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale, // We make a proposed locale based on the UI language and the timezone's country. There is no // guarantee that this will be a valid, supported locale (often it won't). QString lc_formats; - const QString combined = QString( "%1_%2" ).arg( language ).arg( countryCode ); - if ( lang.isEmpty() ) + const QString combined = QString( "%1_%2" ).arg( bestLocale.language ).arg( countryCode ); + if ( availableLocales.contains( bestLocale.language ) ) { - cDebug() << Logger::SubEntry << "Looking up formats for" << combinedLanguageAndCountry; - // We look up if it's a supported locale. - for ( const QString& line : availableLocales ) - { - if ( line.startsWith( combinedLanguageAndCountry ) ) - { - lang = line; - lc_formats = line; - break; - } - } + cDebug() << Logger::SubEntry << "Exact formats match for language tag" << bestLocale.language; + lc_formats = bestLocale.language; } - else + else if ( availableLocales.contains( combined ) ) { - if ( availableLocales.contains( lang ) ) - { - cDebug() << Logger::SubEntry << "Exact formats match for language tag" << lang; - lc_formats = lang; - } - else if ( availableLocales.contains( combinedLanguageAndCountry ) ) - { - cDebug() << Logger::SubEntry << "Exact formats match for combined" << combinedLanguageAndCountry; - lang = combinedLanguageAndCountry; - lc_formats = combinedLanguageAndCountry; - } + cDebug() << Logger::SubEntry << "Exact formats match for combined" << combined; + lc_formats = combined; } if ( lc_formats.isEmpty() ) @@ -303,12 +294,7 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale, // If we cannot make a good choice for a given country we go with the LANG // setting, which defaults to en_US.UTF-8 UTF-8 if all else fails. - if ( lc_formats.isEmpty() ) - { - lc_formats = lang; - } - - return LocaleConfiguration( lang, lc_formats ); + return LocaleConfiguration( bestLocale.name(), lc_formats.isEmpty() ? bestLocale.name() : lc_formats ); } diff --git a/src/modules/locale/LocaleNames.cpp b/src/modules/locale/LocaleNames.cpp new file mode 100644 index 000000000..401aa4809 --- /dev/null +++ b/src/modules/locale/LocaleNames.cpp @@ -0,0 +1,90 @@ +/* === This file is part of Calamares - === + * + * SPDX-FileCopyrightText: 2022 Adriaan de Groot + * SPDX-License-Identifier: GPL-3.0-or-later + * + * Calamares is Free Software: see the License-Identifier above. + * + */ + +#include "LocaleNames.h" + +#include "utils/Logger.h" + +#include + +LocaleNameParts +LocaleNameParts::fromName( const QString& name ) +{ + auto requireAndRemoveLeadingChar = []( QChar c, QString s ) + { + if ( s.startsWith( c ) ) + { + return s.remove( 0, 1 ); + } + else + { + return QString(); + } + }; + + auto parts = QRegularExpression( "^([a-zA-Z]+)(_[a-zA-Z]+)?(\\.[-a-zA-Z0-9]+)?(@[a-zA-Z]+)?" ).match( name ); + const QString calamaresLanguage = parts.captured( 1 ); + const QString calamaresCountry = requireAndRemoveLeadingChar( '_', parts.captured( 2 ) ); + const QString calamaresEncoding = requireAndRemoveLeadingChar( '.', parts.captured( 3 ) ); + const QString calamaresRegion = requireAndRemoveLeadingChar( '@', parts.captured( 4 ) ); + + if ( calamaresLanguage.isEmpty() ) + { + return LocaleNameParts {}; + } + else + { + return LocaleNameParts { calamaresLanguage, calamaresCountry, calamaresRegion, calamaresEncoding }; + } +} + +QString +LocaleNameParts::name() const +{ + // We don't want QStringView to a temporary; force conversion + auto insertLeadingChar = []( QChar c, QString s ) -> QString + { + if ( s.isEmpty() ) + { + return QString(); + } + else + { + return c + s; + } + }; + + if ( !isValid() ) + { + return QString(); + } + else + { + return language + insertLeadingChar( '_', country ) + insertLeadingChar( '.', encoding ) + + insertLeadingChar( '@', region ); + } +} + + +int +LocaleNameParts::similarity( const LocaleNameParts& other ) const +{ + if ( !isValid() || !other.isValid() ) + { + return 0; + } + if ( language != other.language ) + { + return 0; + } + const auto matched_region = ( region == other.region ? 30 : 0 ); + const auto matched_country = ( country == other.country ? ( country.isEmpty() ? 10 : 20 ) : 0 ); + const auto no_other_country_given = ( ( country != other.country && other.country.isEmpty() ) ? 10 : 0 ); + return 50 + matched_region + matched_country + no_other_country_given; +} diff --git a/src/modules/locale/LocaleNames.h b/src/modules/locale/LocaleNames.h new file mode 100644 index 000000000..8498aa28a --- /dev/null +++ b/src/modules/locale/LocaleNames.h @@ -0,0 +1,46 @@ +/* === This file is part of Calamares - === + * + * SPDX-FileCopyrightText: 2022 Adriaan de Groot + * SPDX-License-Identifier: GPL-3.0-or-later + * + * Calamares is Free Software: see the License-Identifier above. + * + */ + +#ifndef LOCALENAMES_H +#define LOCALENAMES_H + +#include + +/** @brief parts of a locale-name (e.g. "ar_LY.UTF-8", split apart) + * + * These are created from lines in `/usr/share/i18n/SUPPORTED`, + * which lists all the locales supported by the system (there + * are also other sources of the same). + * + */ +struct LocaleNameParts +{ + QString language; // e.g. "ar" + QString country; // e.g. "LY" (may be empty) + QString region; // e.g. "@valencia" (may be empty) + QString encoding; // e.g. "UTF-8" (may be empty) + + bool isValid() const { return !language.isEmpty(); } + QString name() const; + + static LocaleNameParts fromName( const QString& name ); + + static inline constexpr const int no_match = 0; + static inline constexpr const int complete_match = 100; + + /** @brief Compute similarity-score with another locale-name. + * + * Similarity is driven by language and region, then country. + * Returns a number between 0 (no similarity, e.g. the + * language is different) and 100 (complete match). + */ + int similarity( const LocaleNameParts& other ) const; +}; + +#endif diff --git a/src/modules/locale/Tests.cpp b/src/modules/locale/Tests.cpp index 69a6a9258..1e1551992 100644 --- a/src/modules/locale/Tests.cpp +++ b/src/modules/locale/Tests.cpp @@ -9,6 +9,7 @@ #include "Config.h" #include "LocaleConfiguration.h" +#include "LocaleNames.h" #include "timezonewidget/TimeZoneImage.h" #include "CalamaresVersion.h" @@ -50,12 +51,16 @@ private Q_SLOTS: void testLanguageDetection(); void testLanguageDetectionValencia(); - // Check realistic language mapping for issue 2008 + // Check that the test-data is available and ok void testKDENeonLanguageData(); + void testLocaleNameParts(); + + // Check realistic language mapping for issue 2008 void testLanguageMappingNeon_data(); void testLanguageMappingNeon(); void testLanguageMappingFreeBSD_data(); void testLanguageMappingFreeBSD(); + void testLanguageSimilarity(); private: QStringList m_KDEneonLocales; @@ -395,6 +400,10 @@ splitTestFileIntoLines( const QString& filename ) void LocaleTests::testKDENeonLanguageData() { + if ( !m_KDEneonLocales.isEmpty() ) + { + return; + } const QStringList neonLocales = splitTestFileIntoLines( QStringLiteral( "locale-data-neon" ) ); cDebug() << "Loaded KDE neon locales test data" << neonLocales.front() << "to" << neonLocales.back(); QCOMPARE( neonLocales.length(), 318 ); // wc -l tells me 318 lines @@ -415,7 +424,7 @@ LocaleTests::MappingData() // Tired of writing QString or QStringLiteral all the time. auto l = []( const char* p ) { return QString::fromUtf8( p ); }; - auto u = [](){ return QString(); }; + auto u = []() { return QString(); }; // The KDEneon columns include the .UTF-8 from the source data // The FreeBSD columns may have u() to indicate "same as KDEneon", @@ -445,12 +454,14 @@ LocaleTests::MappingData() } -void LocaleTests::testLanguageMappingNeon_data() +void +LocaleTests::testLanguageMappingNeon_data() { MappingData(); } -void LocaleTests::testLanguageMappingFreeBSD_data() +void +LocaleTests::testLanguageMappingFreeBSD_data() { MappingData(); } @@ -458,6 +469,7 @@ void LocaleTests::testLanguageMappingFreeBSD_data() void LocaleTests::testLanguageMappingNeon() { + testKDENeonLanguageData(); QVERIFY( !m_KDEneonLocales.isEmpty() ); QFETCH( QString, selectedLanguage ); @@ -474,6 +486,7 @@ LocaleTests::testLanguageMappingNeon() void LocaleTests::testLanguageMappingFreeBSD() { + testKDENeonLanguageData(); QVERIFY( !m_FreeBSDLocales.isEmpty() ); QFETCH( QString, selectedLanguage ); @@ -488,6 +501,84 @@ LocaleTests::testLanguageMappingFreeBSD() QCOMPARE( bsd.language(), expected ); } +void +LocaleTests::testLocaleNameParts() +{ + testKDENeonLanguageData(); + QVERIFY( !m_FreeBSDLocales.isEmpty() ); + QVERIFY( !m_KDEneonLocales.isEmpty() ); + + // Example constant locales + { + auto c_parts = LocaleNameParts::fromName( QStringLiteral( "nl_NL.UTF-8" ) ); + QCOMPARE( c_parts.language, QStringLiteral( "nl" ) ); + QCOMPARE( c_parts.country, QStringLiteral( "NL" ) ); + QCOMPARE( c_parts.encoding, QStringLiteral( "UTF-8" ) ); + QVERIFY( c_parts.region.isEmpty() ); + } + { + auto c_parts = LocaleNameParts::fromName( QStringLiteral( "C.UTF-8" ) ); + QCOMPARE( c_parts.language, QStringLiteral( "C" ) ); + QVERIFY( c_parts.country.isEmpty() ); + QCOMPARE( c_parts.encoding, QStringLiteral( "UTF-8" ) ); + QVERIFY( c_parts.region.isEmpty() ); + } + + // Check all the loaded test locales + for ( const auto& s : m_FreeBSDLocales ) + { + auto parts = LocaleNameParts::fromName( s ); + QVERIFY( parts.isValid() ); + QCOMPARE( parts.name(), s ); + } + + for ( const auto& s : m_KDEneonLocales ) + { + auto parts = LocaleNameParts::fromName( s ); + QVERIFY( parts.isValid() ); + QCOMPARE( parts.name(), s ); + } +} + +void +LocaleTests::testLanguageSimilarity() +{ + // Empty + { + QCOMPARE( LocaleNameParts().similarity( LocaleNameParts() ), 0 ); + } + // Some simple Dutch situations + { + auto nl_parts = LocaleNameParts::fromName( QStringLiteral( "nl_NL.UTF-8" ) ); + auto be_parts = LocaleNameParts::fromName( QStringLiteral( "nl_BE.UTF-8" ) ); + auto nl_short_parts = LocaleNameParts::fromName( QStringLiteral( "nl" ) ); + QCOMPARE( nl_parts.similarity( nl_parts ), 100 ); + QCOMPARE( nl_parts.similarity( LocaleNameParts() ), 0 ); + QCOMPARE( nl_parts.similarity( be_parts ), 80 ); // Language + (empty) region match + QCOMPARE( nl_parts.similarity( nl_short_parts ), 90 ); + } + + // Everything matches itself + { + if ( m_KDEneonLocales.isEmpty() ) + { + testKDENeonLanguageData(); + } + QVERIFY( !m_FreeBSDLocales.isEmpty() ); + QVERIFY( !m_KDEneonLocales.isEmpty() ); + for ( const auto& l : m_KDEneonLocales ) + { + auto locale_name = LocaleNameParts::fromName( l ); + auto self_similarity = locale_name.similarity( locale_name ); + if ( self_similarity != 100 ) + { + cDebug() << "Locale" << l << "is unusual."; + } + QCOMPARE( self_similarity, 100 ); + } + } +} + #include "utils/moc-warnings.h" From 3f5d656c61188876538824656d379432a755d591 Mon Sep 17 00:00:00 2001 From: Adriaan de Groot Date: Sun, 14 Aug 2022 17:16:12 +0200 Subject: [PATCH 2/2] [localeq] Needs more shared sources from locale (cherry picked from commit a988298a653295ef674061e81ca2d49d982d0a9f) --- src/modules/localeq/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/modules/localeq/CMakeLists.txt b/src/modules/localeq/CMakeLists.txt index 85b3721d9..d9741e506 100644 --- a/src/modules/localeq/CMakeLists.txt +++ b/src/modules/localeq/CMakeLists.txt @@ -41,8 +41,9 @@ calamares_add_plugin(localeq EXPORT_MACRO PLUGINDLLEXPORT_PRO SOURCES LocaleQmlViewStep.cpp - ${_locale}/LocaleConfiguration.cpp ${_locale}/Config.cpp + ${_locale}/LocaleConfiguration.cpp + ${_locale}/LocaleNames.cpp ${_locale}/SetTimezoneJob.cpp RESOURCES localeq.qrc