[locale] Repair locale-matching (cherry-pick from *calamares*)

- add struct that splits a locale name into parts
- be more chatty during matching

(cherry picked from commit fd56b5bdc4)
(cherry picked from commit 78e216fedb)
(cherry picked from commit cfb8ef9f65)
(cherry picked from commit eb242168bf)
(cherry picked from commit 40527ffd4e)
(cherry picked from commit 6cbf2d7e32)
(cherry picked from commit a422fd80d9)
(cherry picked from commit 3540121449)
This commit is contained in:
Adriaan de Groot 2022-07-26 22:10:46 +02:00
parent 57374a3271
commit ab27f0aa2e
5 changed files with 346 additions and 140 deletions

View File

@ -22,6 +22,7 @@ calamares_add_plugin(locale
Config.cpp
LCLocaleDialog.cpp
LocaleConfiguration.cpp
LocaleNames.cpp
LocalePage.cpp
LocaleViewStep.cpp
SetTimezoneJob.cpp
@ -39,15 +40,7 @@ calamares_add_plugin(locale
calamares_add_test(
localetest
SOURCES
Tests.cpp
Config.cpp
LocaleConfiguration.cpp
SetTimezoneJob.cpp
timezonewidget/TimeZoneImage.cpp
DEFINITIONS
SOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/images"
DEBUG_TIMEZONES=1
LIBRARIES
Qt5::Gui
SOURCES Tests.cpp Config.cpp LocaleConfiguration.cpp LocaleNames.cpp SetTimezoneJob.cpp timezonewidget/TimeZoneImage.cpp
DEFINITIONS SOURCE_DIR="${CMAKE_CURRENT_LIST_DIR}/images" DEBUG_TIMEZONES=1
LIBRARIES Qt5::Gui
)

View File

@ -9,11 +9,13 @@
*/
#include "LocaleConfiguration.h"
#include "LocaleNames.h"
#include "utils/Logger.h"
#include <QLocale>
#include <QRegularExpression>
#include <QVector>
LocaleConfiguration::LocaleConfiguration()
: explicit_lang( false )
@ -40,6 +42,106 @@ LocaleConfiguration::setLanguage( const QString& localeName )
m_lang = localeName;
}
/** @brief Returns a copy of @p p whose country is replaced by @p country.
 *
 * Language, region and encoding are carried over unchanged.
 */
static LocaleNameParts
updateCountry( LocaleNameParts p, const QString& country )
{
    // Member order of the aggregate: language, country, region, encoding
    return LocaleNameParts { p.language, country, p.region, p.encoding };
}
/** @brief Finds the entry in @p others that is most similar to @p referenceLocale.
 *
 * As a side effect, @p others is sorted in place by ascending similarity
 * to @p referenceLocale, so the best candidate ends up last.
 *
 * @return a pair of (similarity score, best matching locale). When @p others
 *      is empty, or when nothing scores above LocaleNameParts::no_match,
 *      the pair is (no_match, invalid LocaleNameParts).
 */
static QPair< int, LocaleNameParts >
identifyBestLanguageMatch( const LocaleNameParts& referenceLocale, QVector< LocaleNameParts >& others )
{
    // QVector::last() requires a non-empty vector, so bail out early
    if ( others.isEmpty() )
    {
        cDebug() << Logger::SubEntry << "Got no good match for" << referenceLocale.name();
        return { LocaleNameParts::no_match, LocaleNameParts {} };
    }

    std::sort( others.begin(),
               others.end(),
               [ & ]( const LocaleNameParts& lhs, const LocaleNameParts& rhs )
               { return referenceLocale.similarity( lhs ) < referenceLocale.similarity( rhs ); } );

    // The best match is at the end
    const LocaleNameParts best_match = others.last();
    const int best_score = referenceLocale.similarity( best_match );
    if ( !( best_score > LocaleNameParts::no_match ) )
    {
        cDebug() << Logger::SubEntry << "Got no good match for" << referenceLocale.name();
        return { LocaleNameParts::no_match, LocaleNameParts {} };
    }

    cDebug() << Logger::SubEntry << "Got best match for" << referenceLocale.name() << "as" << best_match.name();
    return { best_score, best_match };
}
/** @brief Returns the locale from @p availableLocales that best matches @p languageLocale.
 *
 * Up to three reference locales are tried, in this order:
 *  - @p languageLocale as given,
 *  - @p languageLocale with its country replaced by @p countryCode,
 *  - @p languageLocale with its country replaced by the country that
 *    QLocale derives for it.
 * The first attempt that yields a complete match wins immediately.
 * Otherwise the highest-scoring partial match over all attempts is used;
 * on equal scores the earlier attempt is kept.
 *
 * If @p languageLocale cannot be parsed, @p availableLocales is empty,
 * or nothing matches at all, the parts of "en_US.UTF-8" are returned.
 */
static LocaleNameParts
identifyBestLanguageMatch( const QString& languageLocale,
                           const QStringList& availableLocales,
                           const QString& countryCode )
{
    const QString default_lang = QStringLiteral( "en_US.UTF-8" );

    const LocaleNameParts self = LocaleNameParts::fromName( languageLocale );
    if ( self.isValid() && !availableLocales.isEmpty() )
    {
        QVector< LocaleNameParts > others;
        others.resize( availableLocales.length() );  // Makes default structs
        std::transform( availableLocales.begin(), availableLocales.end(), others.begin(), LocaleNameParts::fromName );

        // The reference locales to try, in order of preference:
        // unmodified, with the chosen location's country, with the QLocale-derived country.
        const QString localeCountry = LocaleNameParts::fromName( QLocale( languageLocale ).name() ).country;
        const LocaleNameParts references[]
            = { self, updateCountry( self, countryCode ), updateCountry( self, localeCountry ) };

        // Keep track of the best match in various attempts
        int best_score = LocaleNameParts::no_match;
        LocaleNameParts best_match;

        for ( const LocaleNameParts& reference : references )
        {
            const auto [ score, match ] = identifyBestLanguageMatch( reference, others );
            if ( score >= LocaleNameParts::complete_match )
            {
                return match;
            }
            if ( score > best_score )
            {
                // Remember the score too, so that a weaker match from a
                // later attempt cannot displace this one.
                best_score = score;
                best_match = match;
            }
        }

        if ( best_match.isValid() )
        {
            cDebug() << Logger::SubEntry << "Matched best with" << best_match.name();
            return best_match;
        }
    }

    // Else we have an unrecognized or unsupported locale, all we can do is go with
    // en_US.UTF-8. This completes all default language setting guesswork.
    return LocaleNameParts::fromName( default_lang );
}
LocaleConfiguration
LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
@ -47,100 +149,7 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
const QString& countryCode )
{
cDebug() << "Mapping" << languageLocale << "in" << countryCode << "to locale.";
QString language = languageLocale.split( '_' ).first();
QString region;
if ( language.contains( '@' ) )
{
auto r = language.split( '@' );
language = r.first();
region = r[ 1 ]; // second()
}
// Either an exact match, or the whole language part matches
// (followed by .<encoding> or _<country>
QStringList linesForLanguage = availableLocales.filter( QRegularExpression( language + "[._]" ) );
cDebug() << Logger::SubEntry << "Matching" << linesForLanguage;
QString lang;
if ( linesForLanguage.isEmpty() || languageLocale.isEmpty() )
{
lang = "en_US.UTF-8";
}
else if ( linesForLanguage.length() == 1 )
{
lang = linesForLanguage.first();
}
// lang could still be empty if we found multiple locales that satisfy myLanguage
const QString combinedLanguageAndCountry = QString( "%1_%2" ).arg( language ).arg( countryCode );
if ( lang.isEmpty() && region.isEmpty() )
{
auto l = linesForLanguage.filter(
QRegularExpression( combinedLanguageAndCountry + "[._]" ) ); // no regional variants
if ( l.length() == 1 )
{
lang = l.first();
}
}
// The following block was inspired by Ubiquity, scripts/localechooser-apply.
// No copyright statement found in file, assuming GPL v2 or later.
/* # In the special cases of Portuguese and Chinese, selecting a
# different location may imply a different dialect of the language.
# In such cases, make LANG reflect the selected language (for
# messages, character types, and collation) and make the other
# locale categories reflect the selected location. */
if ( language == "pt" || language == "zh" )
{
cDebug() << Logger::SubEntry << "Special-case Portuguese and Chinese";
QString proposedLocale = QString( "%1_%2" ).arg( language ).arg( countryCode );
for ( const QString& line : linesForLanguage )
{
if ( line.contains( proposedLocale ) )
{
cDebug() << Logger::SubEntry << "Country-variant" << line << "chosen.";
lang = line;
break;
}
}
}
if ( lang.isEmpty() && !region.isEmpty() )
{
cDebug() << Logger::SubEntry << "Special-case region @" << region;
QString proposedRegion = QString( "@%1" ).arg( region );
for ( const QString& line : linesForLanguage )
{
if ( line.startsWith( language ) && line.contains( proposedRegion ) )
{
cDebug() << Logger::SubEntry << "Region-variant" << line << "chosen.";
lang = line;
break;
}
}
}
// If we found no good way to set a default lang, do a search with the whole
// language locale and pick the first result, if any.
if ( lang.isEmpty() )
{
for ( const QString& line : availableLocales )
{
if ( line.startsWith( languageLocale ) )
{
lang = line;
break;
}
}
}
// Else we have an unrecognized or unsupported locale, all we can do is go with
// en_US.UTF-8 UTF-8. This completes all default language setting guesswork.
if ( lang.isEmpty() )
{
lang = "en_US.UTF-8";
}
const auto bestLocale = identifyBestLanguageMatch( languageLocale, availableLocales, countryCode );
// The following block was inspired by Ubiquity, scripts/localechooser-apply.
// No copyright statement found in file, assuming GPL v2 or later.
@ -188,34 +197,16 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
// We make a proposed locale based on the UI language and the timezone's country. There is no
// guarantee that this will be a valid, supported locale (often it won't).
QString lc_formats;
const QString combined = QString( "%1_%2" ).arg( language ).arg( countryCode );
if ( lang.isEmpty() )
const QString combined = QString( "%1_%2" ).arg( bestLocale.language ).arg( countryCode );
if ( availableLocales.contains( bestLocale.language ) )
{
cDebug() << Logger::SubEntry << "Looking up formats for" << combinedLanguageAndCountry;
// We look up if it's a supported locale.
for ( const QString& line : availableLocales )
{
if ( line.startsWith( combinedLanguageAndCountry ) )
{
lang = line;
lc_formats = line;
break;
}
}
cDebug() << Logger::SubEntry << "Exact formats match for language tag" << bestLocale.language;
lc_formats = bestLocale.language;
}
else
else if ( availableLocales.contains( combined ) )
{
if ( availableLocales.contains( lang ) )
{
cDebug() << Logger::SubEntry << "Exact formats match for language tag" << lang;
lc_formats = lang;
}
else if ( availableLocales.contains( combinedLanguageAndCountry ) )
{
cDebug() << Logger::SubEntry << "Exact formats match for combined" << combinedLanguageAndCountry;
lang = combinedLanguageAndCountry;
lc_formats = combinedLanguageAndCountry;
}
cDebug() << Logger::SubEntry << "Exact formats match for combined" << combined;
lc_formats = combined;
}
if ( lc_formats.isEmpty() )
@ -303,12 +294,7 @@ LocaleConfiguration::fromLanguageAndLocation( const QString& languageLocale,
// If we cannot make a good choice for a given country we go with the LANG
// setting, which defaults to en_US.UTF-8 UTF-8 if all else fails.
if ( lc_formats.isEmpty() )
{
lc_formats = lang;
}
return LocaleConfiguration( lang, lc_formats );
return LocaleConfiguration( bestLocale.name(), lc_formats.isEmpty() ? bestLocale.name() : lc_formats );
}

View File

@ -0,0 +1,90 @@
/* === This file is part of Calamares - <https://calamares.io> ===
*
* SPDX-FileCopyrightText: 2022 Adriaan de Groot <groot@kde.org>
* SPDX-License-Identifier: GPL-3.0-or-later
*
* Calamares is Free Software: see the License-Identifier above.
*
*/
#include "LocaleNames.h"
#include "utils/Logger.h"
#include <QRegularExpression>
/** @brief Splits a locale name such as "ca_ES.UTF-8@valencia" into its parts.
 *
 * The expected shape is language[_country][.encoding][@region]. Only the
 * language is mandatory; optional parts that are absent end up empty.
 * The separator characters ('_', '.', '@') are not stored in the parts.
 *
 * @return the parts of @p name, or an invalid (default-constructed)
 *      LocaleNameParts when @p name does not start with a language.
 */
LocaleNameParts
LocaleNameParts::fromName( const QString& name )
{
    // Compiled once and reused: this function is called for every available locale.
    static const QRegularExpression localePattern(
        "^([a-zA-Z]+)(_[a-zA-Z]+)?(\\.[-a-zA-Z0-9]+)?(@[a-zA-Z]+)?" );

    // Strips the expected separator from a captured group; anything that
    // does not start with the separator (e.g. an unmatched group) becomes empty.
    const auto withoutSeparator = []( QChar separator, const QString& captured )
    { return captured.startsWith( separator ) ? captured.mid( 1 ) : QString(); };

    const auto match = localePattern.match( name );
    const QString languagePart = match.captured( 1 );
    if ( languagePart.isEmpty() )
    {
        return LocaleNameParts {};
    }

    const QString countryPart = withoutSeparator( '_', match.captured( 2 ) );
    const QString encodingPart = withoutSeparator( '.', match.captured( 3 ) );
    const QString regionPart = withoutSeparator( '@', match.captured( 4 ) );

    // Aggregate order is language, country, region, encoding
    return LocaleNameParts { languagePart, countryPart, regionPart, encodingPart };
}
/** @brief Re-assembles the locale name from its parts.
 *
 * The result has the shape language[_country][.encoding][@region];
 * empty parts are left out together with their separator.
 * An invalid LocaleNameParts yields an empty string.
 */
QString
LocaleNameParts::name() const
{
    if ( !isValid() )
    {
        return QString();
    }

    QString assembled = language;
    // Appends separator + part, but only for parts that are present
    const auto appendPart = [ &assembled ]( QChar separator, const QString& part )
    {
        if ( !part.isEmpty() )
        {
            assembled += separator;
            assembled += part;
        }
    };

    appendPart( '_', country );
    appendPart( '.', encoding );
    appendPart( '@', region );
    return assembled;
}
int
LocaleNameParts::similarity( const LocaleNameParts& other ) const
{
if ( !isValid() || !other.isValid() )
{
return 0;
}
if ( language != other.language )
{
return 0;
}
const auto matched_region = ( region == other.region ? 30 : 0 );
const auto matched_country = ( country == other.country ? ( country.isEmpty() ? 10 : 20 ) : 0 );
const auto no_other_country_given = ( ( country != other.country && other.country.isEmpty() ) ? 10 : 0 );
return 50 + matched_region + matched_country + no_other_country_given;
}

View File

@ -0,0 +1,46 @@
/* === This file is part of Calamares - <https://calamares.io> ===
*
* SPDX-FileCopyrightText: 2022 Adriaan de Groot <groot@kde.org>
* SPDX-License-Identifier: GPL-3.0-or-later
*
* Calamares is Free Software: see the License-Identifier above.
*
*/
#ifndef LOCALENAMES_H
#define LOCALENAMES_H
#include <QString>
/** @brief parts of a locale-name (e.g. "ar_LY.UTF-8", split apart)
 *
 * These are created from lines in `/usr/share/i18n/SUPPORTED`,
 * which lists all the locales supported by the system (there
 * are also other sources of the same).
 *
 * The parts are stored without their separator characters: fromName()
 * strips the leading '_', '.' and '@', and name() puts them back.
 *
 * This is a plain aggregate; fromName() brace-initializes it positionally,
 * so the order of the data members below must not be changed.
 */
struct LocaleNameParts
{
QString language; // e.g. "ar"
QString country; // e.g. "LY" (may be empty)
QString region; // e.g. "valencia", stored without the leading '@' (may be empty)
QString encoding; // e.g. "UTF-8" (may be empty)
/// @brief A locale-name is valid as soon as it has a language.
bool isValid() const { return !language.isEmpty(); }
/// @brief Re-assembles the full name, e.g. "ca_ES.UTF-8@valencia"; empty if invalid.
QString name() const;
/// @brief Splits @p name into parts; returns an invalid object if no language is found.
static LocaleNameParts fromName( const QString& name );
/// Score returned by similarity() when there is no similarity at all.
static inline constexpr const int no_match = 0;
/// Score returned by similarity() for a complete match.
static inline constexpr const int complete_match = 100;
/** @brief Compute similarity-score with another locale-name.
 *
 * Similarity is driven by language and region, then country.
 * The encoding is not taken into account.
 * Returns a number between 0 (no similarity, e.g. the
 * language is different) and 100 (complete match).
 */
int similarity( const LocaleNameParts& other ) const;
};
#endif

View File

@ -9,6 +9,7 @@
#include "Config.h"
#include "LocaleConfiguration.h"
#include "LocaleNames.h"
#include "timezonewidget/TimeZoneImage.h"
#include "CalamaresVersion.h"
@ -50,12 +51,16 @@ private Q_SLOTS:
void testLanguageDetection();
void testLanguageDetectionValencia();
// Check realistic language mapping for issue 2008
// Check that the test-data is available and ok
void testKDENeonLanguageData();
void testLocaleNameParts();
// Check realistic language mapping for issue 2008
void testLanguageMappingNeon_data();
void testLanguageMappingNeon();
void testLanguageMappingFreeBSD_data();
void testLanguageMappingFreeBSD();
void testLanguageSimilarity();
private:
QStringList m_KDEneonLocales;
@ -395,6 +400,10 @@ splitTestFileIntoLines( const QString& filename )
void
LocaleTests::testKDENeonLanguageData()
{
if ( !m_KDEneonLocales.isEmpty() )
{
return;
}
const QStringList neonLocales = splitTestFileIntoLines( QStringLiteral( "locale-data-neon" ) );
cDebug() << "Loaded KDE neon locales test data" << neonLocales.front() << "to" << neonLocales.back();
QCOMPARE( neonLocales.length(), 318 ); // wc -l tells me 318 lines
@ -415,7 +424,7 @@ LocaleTests::MappingData()
// Tired of writing QString or QStringLiteral all the time.
auto l = []( const char* p ) { return QString::fromUtf8( p ); };
auto u = [](){ return QString(); };
auto u = []() { return QString(); };
// The KDEneon columns include the .UTF-8 from the source data
// The FreeBSD columns may have u() to indicate "same as KDEneon",
@ -445,12 +454,14 @@ LocaleTests::MappingData()
}
void LocaleTests::testLanguageMappingNeon_data()
void
LocaleTests::testLanguageMappingNeon_data()
{
MappingData();
}
void LocaleTests::testLanguageMappingFreeBSD_data()
void
LocaleTests::testLanguageMappingFreeBSD_data()
{
MappingData();
}
@ -458,6 +469,7 @@ void LocaleTests::testLanguageMappingFreeBSD_data()
void
LocaleTests::testLanguageMappingNeon()
{
testKDENeonLanguageData();
QVERIFY( !m_KDEneonLocales.isEmpty() );
QFETCH( QString, selectedLanguage );
@ -474,6 +486,7 @@ LocaleTests::testLanguageMappingNeon()
void
LocaleTests::testLanguageMappingFreeBSD()
{
testKDENeonLanguageData();
QVERIFY( !m_FreeBSDLocales.isEmpty() );
QFETCH( QString, selectedLanguage );
@ -488,6 +501,84 @@ LocaleTests::testLanguageMappingFreeBSD()
QCOMPARE( bsd.language(), expected );
}
/** @brief Checks that locale names are split into the right parts.
 *
 * Two hand-written examples are checked part by part. After that, every
 * locale from the loaded FreeBSD and KDE neon test data must parse as
 * valid and re-assemble to exactly the original name.
 */
void
LocaleTests::testLocaleNameParts()
{
    // Make sure the test data has been loaded
    testKDENeonLanguageData();
    QVERIFY( !m_FreeBSDLocales.isEmpty() );
    QVERIFY( !m_KDEneonLocales.isEmpty() );

    // Example constant locales: a full Dutch locale ..
    {
        const auto dutch = LocaleNameParts::fromName( QStringLiteral( "nl_NL.UTF-8" ) );
        QCOMPARE( dutch.language, QStringLiteral( "nl" ) );
        QCOMPARE( dutch.country, QStringLiteral( "NL" ) );
        QCOMPARE( dutch.encoding, QStringLiteral( "UTF-8" ) );
        QVERIFY( dutch.region.isEmpty() );
    }
    // .. and the C locale, which has no country
    {
        const auto posix = LocaleNameParts::fromName( QStringLiteral( "C.UTF-8" ) );
        QCOMPARE( posix.language, QStringLiteral( "C" ) );
        QVERIFY( posix.country.isEmpty() );
        QCOMPARE( posix.encoding, QStringLiteral( "UTF-8" ) );
        QVERIFY( posix.region.isEmpty() );
    }

    // Check all the loaded test locales: splitting and re-joining is lossless
    for ( const auto& localeName : m_FreeBSDLocales )
    {
        const auto roundTrip = LocaleNameParts::fromName( localeName );
        QVERIFY( roundTrip.isValid() );
        QCOMPARE( roundTrip.name(), localeName );
    }
    for ( const auto& localeName : m_KDEneonLocales )
    {
        const auto roundTrip = LocaleNameParts::fromName( localeName );
        QVERIFY( roundTrip.isValid() );
        QCOMPARE( roundTrip.name(), localeName );
    }
}
/** @brief Checks the similarity scores between locale names.
 *
 * Covers the invalid (empty) case, a handful of Dutch locales with
 * known scores, and finally that every KDE neon test locale scores
 * 100 against itself.
 */
void
LocaleTests::testLanguageSimilarity()
{
    // Empty
    {
        QCOMPARE( LocaleNameParts().similarity( LocaleNameParts() ), 0 );
    }

    // Some simple Dutch situations
    {
        const auto netherlands = LocaleNameParts::fromName( QStringLiteral( "nl_NL.UTF-8" ) );
        const auto belgium = LocaleNameParts::fromName( QStringLiteral( "nl_BE.UTF-8" ) );
        const auto languageOnly = LocaleNameParts::fromName( QStringLiteral( "nl" ) );
        QCOMPARE( netherlands.similarity( netherlands ), 100 );
        QCOMPARE( netherlands.similarity( LocaleNameParts() ), 0 );
        QCOMPARE( netherlands.similarity( belgium ), 80 );  // Language + (empty) region match
        QCOMPARE( netherlands.similarity( languageOnly ), 90 );
    }

    // Everything matches itself
    {
        // Load the test data only if an earlier test has not done so already
        if ( m_KDEneonLocales.isEmpty() )
        {
            testKDENeonLanguageData();
        }
        QVERIFY( !m_FreeBSDLocales.isEmpty() );
        QVERIFY( !m_KDEneonLocales.isEmpty() );

        for ( const auto& l : m_KDEneonLocales )
        {
            const auto parsed = LocaleNameParts::fromName( l );
            const auto selfScore = parsed.similarity( parsed );
            // Name the offending locale before QCOMPARE aborts the test
            if ( selfScore != 100 )
            {
                cDebug() << "Locale" << l << "is unusual.";
            }
            QCOMPARE( selfScore, 100 );
        }
    }
}
#include "utils/moc-warnings.h"