using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Parser.Core;

/// <summary>
/// Detects the languages mentioned in a title string using plain keyword lookups
/// and two regular expressions (one case-sensitive, one case-insensitive).
/// </summary>
public static class LanguageParser
{
    // Case-insensitive language tags. Each alternative is a named group; the group names
    // are the keys listed in CaseInsensitiveGroupLanguages below. The pattern is written
    // across several lines, which RegexOptions.IgnorePatternWhitespace permits.
    private static readonly Regex LanguageRegex = new(
        @"(?:\W|_|^)(?<english>\beng\b)|
          (?<italian>\b(?:ita|italian)\b)|
          (?<german>(?:swiss)?german\b|videomann|ger[. ]dub|\bger\b)|
          (?<flemish>flemish)|
          (?<bulgarian>bgaudio)|
          (?<romanian>rodubbed)|
          (?<brazilian>\b(dublado|pt-BR)\b)|
          (?<greek>greek)|
          (?<french>\b(?:FR|VO|VF|VFF|VFQ|VFI|VF2|TRUEFRENCH|FRENCH|FRE|FRA)\b)|
          (?<russian>\b(?:rus|ru)\b)|
          (?<hungarian>\b(?:HUNDUB|HUN)\b)|
          (?<hebrew>\b(?:HebDub|HebDubbed)\b)|
          (?<polish>\b(?:PL\W?DUB|DUB\W?PL|LEK\W?PL|PL\W?LEK)\b)|
          (?<chinese>\[(?:CH[ST]|BIG5|GB)\]|简|繁|字幕)|
          (?<ukrainian>(?:(?:\dx)?UKR))|
          (?<spanish>\b(?:español|castellano)\b)|
          (?<catalan>\b(?:catalan?|catalán|català)\b)|
          (?<latvian>\b(?:lat|lav|lv)\b)|
          (?<telugu>\btel\b)|
          (?<vietnamese>\bVIE\b)|
          (?<japanese>\bJAP\b)|
          (?<korean>\bKOR\b)|
          (?<urdu>\burdu\b)|
          (?<romansh>\b(?:romansh|rumantsch|romansch)\b)|
          (?<mongolian>\b(?:mongolian|khalkha)\b)|
          (?<georgian>\b(?:georgian|geo|ka|kat)\b)|
          (?<original>\b(?:orig|original)\b)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // Case-sensitive two-letter codes (EN, LT, CZ, ...); no RegexOptions.IgnoreCase here.
    // NOTE(review): the reviewed text of this pattern was truncated right after the start of
    // the Spanish alternative and its lookarounds had been stripped. The SUB guards, the
    // DTS guard, the "ES" token and the options below are a reconstruction - confirm
    // against version control before merging.
    private static readonly Regex CaseSensitiveLanguageRegex = new(
        @"(?:(?i)(?<!SUB[\W|_|^]))(?:(?<english>\bEN\b)|
          (?<lithuanian>\bLT\b)|
          (?<czech>\bCZ\b)|
          (?<polish>\bPL\b)|
          (?<bulgarian>\bBG\b)|
          (?<slovak>\bSK\b)|
          (?<german>\bDE\b)|
          (?<spanish>\b(?<!DTS[._ -])ES\b))(?:(?i)(?![\W|_|^]SUB))",
        RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace);

    // NOTE(review): the definitions of the two German helper regexes were missing from the
    // reviewed text although ParseLanguages uses them. The "DL" / "ML" patterns and their
    // options are a reconstruction - confirm against version control before merging.
    private static readonly Regex GermanDualLanguageRegex = new(
        @"(?<!WEB[-_. ]?)\bDL\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private static readonly Regex GermanMultiLanguageRegex = new(
        @"\bML\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Keywords looked up anywhere in the title (substring match, case-insensitive),
    // in the order they are checked.
    private static readonly (string Keyword, Language Language)[] KeywordLanguages =
    {
        ("english", Language.English),
        ("spanish", Language.Spanish),
        ("danish", Language.Danish),
        ("dutch", Language.Dutch),
        ("japanese", Language.Japanese),
        ("icelandic", Language.Icelandic),
        ("mandarin", Language.Chinese),
        ("cantonese", Language.Chinese),
        ("chinese", Language.Chinese),
        ("korean", Language.Korean),
        ("russian", Language.Russian),
        ("romanian", Language.Romanian),
        ("hindi", Language.Hindi),
        ("arabic", Language.Arabic),
        ("thai", Language.Thai),
        ("bulgarian", Language.Bulgarian),
        ("polish", Language.Polish),
        ("vietnamese", Language.Vietnamese),
        ("swedish", Language.Swedish),
        ("norwegian", Language.Norwegian),
        ("finnish", Language.Finnish),
        ("turkish", Language.Turkish),
        ("portuguese", Language.Portuguese),
        ("brazilian", Language.PortugueseBR),
        ("hungarian", Language.Hungarian),
        ("hebrew", Language.Hebrew),
        ("ukrainian", Language.Ukrainian),
        ("persian", Language.Persian),
        ("bengali", Language.Bengali),
        ("slovak", Language.Slovak),
        ("latvian", Language.Latvian),
        ("latino", Language.SpanishLatino),
        ("tamil", Language.Tamil),
        ("telugu", Language.Telugu),
        ("malayalam", Language.Malayalam),
        ("kannada", Language.Kannada),
        ("albanian", Language.Albanian),
        ("afrikaans", Language.Afrikaans),
        ("marathi", Language.Marathi),
        ("tagalog", Language.Tagalog),
    };

    // Named groups of CaseSensitiveLanguageRegex and the language each one signals.
    private static readonly (string Group, Language Language)[] CaseSensitiveGroupLanguages =
    {
        ("english", Language.English),
        ("lithuanian", Language.Lithuanian),
        ("czech", Language.Czech),
        ("polish", Language.Polish),
        ("bulgarian", Language.Bulgarian),
        ("slovak", Language.Slovak),
        ("spanish", Language.Spanish),
        ("german", Language.German),
    };

    // Named groups of LanguageRegex and the language each one signals.
    private static readonly (string Group, Language Language)[] CaseInsensitiveGroupLanguages =
    {
        ("english", Language.English),
        ("italian", Language.Italian),
        ("german", Language.German),
        ("flemish", Language.Flemish),
        ("greek", Language.Greek),
        ("french", Language.French),
        ("russian", Language.Russian),
        ("bulgarian", Language.Bulgarian),
        ("brazilian", Language.PortugueseBR),
        ("hungarian", Language.Hungarian),
        ("hebrew", Language.Hebrew),
        ("polish", Language.Polish),
        ("chinese", Language.Chinese),
        ("spanish", Language.Spanish),
        ("catalan", Language.Catalan),
        ("ukrainian", Language.Ukrainian),
        ("latvian", Language.Latvian),
        ("romanian", Language.Romanian),
        ("telugu", Language.Telugu),
        ("vietnamese", Language.Vietnamese),
        ("japanese", Language.Japanese),
        ("korean", Language.Korean),
        ("urdu", Language.Urdu),
        ("romansh", Language.Romansh),
        ("mongolian", Language.Mongolian),
        ("georgian", Language.Georgian),
        ("original", Language.Original),
    };

    /// <summary>
    /// Collects every language detected in <paramref name="title"/>.
    /// </summary>
    /// <param name="title">The title text to inspect.</param>
    /// <returns>
    /// The distinct detected languages in order of first detection (keyword hits first,
    /// then case-sensitive regex hits, then case-insensitive regex hits). Contains only
    /// <c>Language.Unknown</c> when nothing was detected. When German is the only detected
    /// language, <c>Language.Original</c> is appended if the dual-language regex matches,
    /// otherwise <c>Language.Original</c> and <c>Language.English</c> are appended if the
    /// multi-language regex matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="title"/> is null.</exception>
    public static List<Language> ParseLanguages(string title)
    {
        if (title is null)
        {
            throw new ArgumentNullException(nameof(title));
        }

        var languages = new List<Language>();

        // Ordinal, case-insensitive search instead of ToLower() + Contains: lower-casing
        // with the current culture can change letters (e.g. 'I' under Turkic cultures)
        // and silently break keywords such as "hindi" or "english".
        foreach (var (keyword, language) in KeywordLanguages)
        {
            if (title.Contains(keyword, StringComparison.OrdinalIgnoreCase))
            {
                AddDistinct(languages, language);
            }
        }

        AddMatchedLanguages(CaseSensitiveLanguageRegex, title, CaseSensitiveGroupLanguages, languages);
        AddMatchedLanguages(LanguageRegex, title, CaseInsensitiveGroupLanguages, languages);

        if (languages.Count == 0)
        {
            languages.Add(Language.Unknown);
        }

        // The list is already duplicate-free here, so a title in which German matched
        // more than once still counts as "German only" for this check.
        if (languages.Count == 1 && languages[0] == Language.German)
        {
            if (GermanDualLanguageRegex.IsMatch(title))
            {
                languages.Add(Language.Original);
            }
            else if (GermanMultiLanguageRegex.IsMatch(title))
            {
                languages.Add(Language.Original);
                languages.Add(Language.English);
            }
        }

        return languages;
    }

    // Runs regex over title and records the language of every named group that matched.
    private static void AddMatchedLanguages(
        Regex regex,
        string title,
        (string Group, Language Language)[] groupLanguages,
        List<Language> languages)
    {
        foreach (Match match in regex.Matches(title))
        {
            foreach (var (group, language) in groupLanguages)
            {
                if (match.Groups[group].Success)
                {
                    AddDistinct(languages, language);
                }
            }
        }
    }

    // Appends language unless it is already present, keeping first-detection order.
    private static void AddDistinct(List<Language> languages, Language language)
    {
        if (!languages.Contains(language))
        {
            languages.Add(language);
        }
    }
}