Files
profilarr/services/parser/Core/TitleParser.cs
Sam Chau 5c26d6d7b2 feat(parser): implement C# parser microservice with regex-based title parsing
- Added RegexReplace class for handling regex replacements.
- Created ReleaseGroupParser for extracting release groups from titles.
- Developed TitleParser for parsing movie titles, including editions and IDs.
- Introduced QualitySource, Resolution, QualityModifier enums and QualityResult class for quality metadata.
- Set up Dockerfile and docker-compose for containerized deployment.
- Implemented ASP.NET Core web API for parsing requests.
- Added TypeScript client for interacting with the parser service.
- Enhanced configuration to support dynamic parser service URL.
2025-12-30 10:33:52 +10:30

422 lines
16 KiB
C#

using System.Text.RegularExpressions;
namespace Parser.Core;
public class ParsedMovieInfo
{
public List<string> MovieTitles { get; set; } = new();
public string PrimaryMovieTitle => MovieTitles.FirstOrDefault() ?? "";
public int Year { get; set; }
public string? Edition { get; set; }
public string? ImdbId { get; set; }
public int TmdbId { get; set; }
public string? ReleaseHash { get; set; }
public string? HardcodedSubs { get; set; }
}
public static class TitleParser
{
private static readonly Regex EditionRegex = new(
@"\(?\b(?<edition>(((Recut.|Extended.|Ultimate.)?(Director.?s|Collector.?s|Theatrical|Ultimate|Extended|Despecialized|(Special|Rouge|Final|Assembly|Imperial|Diamond|Signature|Hunter|Rekall)(?=(.(Cut|Edition|Version)))|\d{2,3}(th)?.Anniversary)(?:.(Cut|Edition|Version))?(.(Extended|Uncensored|Remastered|Unrated|Uncut|Open.?Matte|IMAX|Fan.?Edit))?|((Uncensored|Remastered|Unrated|Uncut|Open?.Matte|IMAX|Fan.?Edit|Restored|((2|3|4)in1))))))\b\)?",
RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex ReportEditionRegex = new(
@"^.+?" + EditionRegex,
RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex HardcodedSubsRegex = new(
@"\b((?<hcsub>(\w+(?<!SOFT|MULTI|HORRIBLE)SUBS?))|(?<hc>(HC|SUBBED)))\b",
RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace);
private static readonly Regex[] ReportMovieTitleRegex = new[]
{
// Anime [Subgroup] and Year
new Regex(@"^(?:\[(?<subgroup>.+?)\][-_. ]?)(?<title>(?![(\[]).+?)?(?:(?:[-_\W](?<![)\[!]))*(?<year>(1(8|9)|20)\d{2}(?!p|i|x|\d+|\]|\W\d+)))+.*?(?<hash>\[\w{8}\])?(?:$|\.)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Anime [Subgroup] no year, versioned title, hash
new Regex(@"^(?:\[(?<subgroup>.+?)\][-_. ]?)(?<title>(?![(\[]).+?)((v)(?:\d{1,2})(?:([-_. ])))(\[.*)?(?:[\[(][^])])?.*?(?<hash>\[\w{8}\])(?:$|\.)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Anime [Subgroup] no year, info in double sets of brackets, hash
new Regex(@"^(?:\[(?<subgroup>.+?)\][-_. ]?)(?<title>(?![(\[]).+?)(\[.*).*?(?<hash>\[\w{8}\])(?:$|\.)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Anime [Subgroup] no year, info in parentheses or brackets, hash
new Regex(@"^(?:\[(?<subgroup>.+?)\][-_. ]?)(?<title>(?![(\[]).+)(?:[\[(][^])]).*?(?<hash>\[\w{8}\])(?:$|\.)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Some german or french tracker formats (missing year, ...) - see ParserFixture for examples and tests
new Regex(@"^(?<title>(?![(\[]).+?)((\W|_))(" + EditionRegex + @".{1,3})?(?:(?<!(19|20)\d{2}.*?)(?<!(?:Good|The)[_ .-])(German|TrueFrench))(.+?)(?=((19|20)\d{2}|$))(?<year>(19|20)\d{2}(?!p|i|\d+|\]|\W\d+))?(\W+|_|$)(?!\\)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Special, Despecialized, etc. Edition Movies, e.g: Mission.Impossible.3.Special.Edition.2011
new Regex(@"^(?<title>(?![(\[]).+?)?(?:(?:[-_\W](?<![)\[!]))*" + EditionRegex + @".{1,3}(?<year>(1(8|9)|20)\d{2}(?!p|i|\d+|\]|\W\d+)))+(\W+|_|$)(?!\\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled),
// Normal movie format, e.g: Mission.Impossible.3.2011
new Regex(@"^(?<title>(?![(\[]).+?)?(?:(?:[-_\W](?<![)\[!]))*(?<year>(1(8|9)|20)\d{2}(?!p|i|(1(8|9)|20)\d{2}|\]|\W(1(8|9)|20)\d{2})))+(\W+|_|$)(?!\\)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// PassThePopcorn Torrent names: Star.Wars[PassThePopcorn]
new Regex(@"^(?<title>.+?)?(?:(?:[-_\W](?<![()\[!]))*(?<year>(\[\w *\])))+(\W+|_|$)(?!\\)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// That did not work? Maybe some tool uses [] for years. Who would do that?
new Regex(@"^(?<title>(?![(\[]).+?)?(?:(?:[-_\W](?<![)!]))*(?<year>(1(8|9)|20)\d{2}(?!p|i|\d+|\W\d+)))+(\W+|_|$)(?!\\)", RegexOptions.IgnoreCase | RegexOptions.Compiled),
// As a last resort for movies that have ( or [ in their title.
new Regex(@"^(?<title>.+?)?(?:(?:[-_\W](?<![)\[!]))*(?<year>(1(8|9)|20)\d{2}(?!p|i|\d+|\]|\W\d+)))+(\W+|_|$)(?!\\)", RegexOptions.IgnoreCase | RegexOptions.Compiled)
};
private static readonly Regex[] ReportMovieTitleFolderRegex = new[]
{
// When year comes first.
new Regex(@"^(?:(?:[-_\W](?<![)!]))*(?<year>(19|20)\d{2}(?!p|i|\d+|\W\d+)))+(\W+|_|$)(?<title>.+?)?$")
};
private static readonly Regex[] RejectHashedReleasesRegex = new Regex[]
{
// Generic match for md5 and mixed-case hashes.
new Regex(@"^[0-9a-zA-Z]{32}", RegexOptions.Compiled),
// Generic match for shorter lower-case hashes.
new Regex(@"^[a-z0-9]{24}$", RegexOptions.Compiled),
// Format seen on some NZBGeek releases
new Regex(@"^[A-Z]{11}\d{3}$", RegexOptions.Compiled),
new Regex(@"^[a-z]{12}\d{3}$", RegexOptions.Compiled),
// Backup filename (Unknown origins)
new Regex(@"^Backup_\d{5,}S\d{2}-\d{2}$", RegexOptions.Compiled),
// 123 - Started appearing December 2014
new Regex(@"^123$", RegexOptions.Compiled),
// abc - Started appearing January 2015
new Regex(@"^abc$", RegexOptions.Compiled | RegexOptions.IgnoreCase),
// abc - Started appearing 2020
new Regex(@"^abc[-_. ]xyz", RegexOptions.Compiled | RegexOptions.IgnoreCase),
// b00bs - Started appearing January 2015
new Regex(@"^b00bs$", RegexOptions.Compiled | RegexOptions.IgnoreCase)
};
// Regex to detect whether the title was reversed.
private static readonly Regex ReversedTitleRegex = new(
@"(?:^|[-._ ])(p027|p0801)[-._ ]",
RegexOptions.Compiled);
// Regex to split movie titles that contain `AKA`.
private static readonly Regex AlternativeTitleRegex = new(
@"[ ]+(?:AKA|\/)[ ]+",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
// Regex to unbracket alternative titles.
private static readonly Regex BracketedAlternativeTitleRegex = new(
@"(.*) \([ ]*AKA[ ]+(.*)\)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex NormalizeAlternativeTitleRegex = new(
@"[ ]+(?:A\.K\.A\.)[ ]+",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex ReportImdbId = new(
@"(?<imdbid>tt\d{7,8})",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex ReportTmdbId = new(
@"tmdb(id)?-(?<tmdbid>\d+)",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly RegexReplace SimpleTitleRegex = new(
@"(?:(480|540|576|720|1080|2160)[ip]|[xh][\W_]?26[45]|DD\W?5\W1|[<>?*]|848x480|1280x720|1920x1080|3840x2160|4096x2160|(8|10)b(it)?|10-bit)\s*?(?![a-b0-9])",
string.Empty,
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex SimpleReleaseTitleRegex = new(
@"\s*(?:[<>?*|])",
RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex CleanQualityBracketsRegex = new(
@"\[[a-z0-9 ._-]+\]$",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RequestInfoRegex = new(
@"^(?:\[.+?\])+",
RegexOptions.Compiled);
public static ParsedMovieInfo? ParseMovieTitle(string title, bool isDir = false)
{
var originalTitle = title;
try
{
if (!ValidateBeforeParsing(title))
{
return null;
}
if (ReversedTitleRegex.IsMatch(title))
{
var titleWithoutExtension = ParserCommon.RemoveFileExtension(title).ToCharArray();
Array.Reverse(titleWithoutExtension);
title = $"{new string(titleWithoutExtension)}{title.Substring(titleWithoutExtension.Length)}";
}
var releaseTitle = ParserCommon.RemoveFileExtension(title);
// Trim dashes from end
releaseTitle = releaseTitle.Trim('-', '_');
releaseTitle = releaseTitle.Replace("【", "[").Replace("】", "]");
foreach (var replace in ParserCommon.PreSubstitutionRegex)
{
if (replace.TryReplace(ref releaseTitle))
{
break;
}
}
var simpleTitle = SimpleTitleRegex.Replace(releaseTitle);
// Remove website prefixes/postfixes
simpleTitle = ParserCommon.WebsitePrefixRegex.Replace(simpleTitle);
simpleTitle = ParserCommon.WebsitePostfixRegex.Replace(simpleTitle);
simpleTitle = ParserCommon.CleanTorrentSuffixRegex.Replace(simpleTitle);
// Clean quality brackets at the end
simpleTitle = CleanQualityBracketsRegex.Replace(simpleTitle, string.Empty);
var allRegexes = ReportMovieTitleRegex.ToList();
if (isDir)
{
allRegexes.AddRange(ReportMovieTitleFolderRegex);
}
foreach (var regex in allRegexes)
{
var match = regex.Matches(simpleTitle);
if (match.Count != 0)
{
var result = ParseMovieMatchCollection(match);
if (result != null)
{
var simpleReleaseTitle = SimpleReleaseTitleRegex.Replace(releaseTitle, string.Empty);
// Parse edition if not already set
if (string.IsNullOrWhiteSpace(result.Edition))
{
result.Edition = ParseEdition(simpleReleaseTitle);
}
// Parse hash
result.ReleaseHash = GetReleaseHash(match);
// Parse hardcoded subs
result.HardcodedSubs = ParseHardcodeSubs(originalTitle);
// Parse IMDB/TMDB IDs
result.ImdbId = ParseImdbId(simpleReleaseTitle);
result.TmdbId = ParseTmdbId(simpleReleaseTitle);
return result;
}
}
}
}
catch
{
// Parsing failed
}
return null;
}
public static string? ParseImdbId(string title)
{
var match = ReportImdbId.Match(title);
if (match.Success && match.Groups["imdbid"].Success)
{
var imdbId = match.Groups["imdbid"].Value;
if (imdbId.Length == 9 || imdbId.Length == 10)
{
return imdbId;
}
}
return null;
}
public static int ParseTmdbId(string title)
{
var match = ReportTmdbId.Match(title);
if (match.Success && match.Groups["tmdbid"].Success)
{
if (int.TryParse(match.Groups["tmdbid"].Value, out var tmdbId))
{
return tmdbId;
}
}
return 0;
}
public static string? ParseEdition(string title)
{
var editionMatch = ReportEditionRegex.Match(title);
if (editionMatch.Success && editionMatch.Groups["edition"].Success &&
!string.IsNullOrWhiteSpace(editionMatch.Groups["edition"].Value))
{
return editionMatch.Groups["edition"].Value.Replace(".", " ");
}
return null;
}
public static string? ParseHardcodeSubs(string title)
{
var subMatch = HardcodedSubsRegex.Matches(title).LastOrDefault();
if (subMatch != null && subMatch.Success)
{
if (subMatch.Groups["hcsub"].Success)
{
return subMatch.Groups["hcsub"].Value;
}
else if (subMatch.Groups["hc"].Success)
{
return "Generic Hardcoded Subs";
}
}
return null;
}
private static ParsedMovieInfo? ParseMovieMatchCollection(MatchCollection matchCollection)
{
if (!matchCollection[0].Groups["title"].Success || matchCollection[0].Groups["title"].Value == "(")
{
return null;
}
var movieName = matchCollection[0].Groups["title"].Value.Replace('_', ' ');
movieName = NormalizeAlternativeTitleRegex.Replace(movieName, " AKA ");
movieName = RequestInfoRegex.Replace(movieName, "").Trim(' ');
// Handle dots in title - preserve acronyms
var parts = movieName.Split('.');
movieName = "";
var n = 0;
var previousAcronym = false;
var nextPart = "";
foreach (var part in parts)
{
if (parts.Length >= n + 2)
{
nextPart = parts[n + 1];
}
else
{
nextPart = "";
}
if (part.Length == 1 && part.ToLower() != "a" && !int.TryParse(part, out _) &&
(previousAcronym || n < parts.Length - 1) &&
(previousAcronym || nextPart.Length != 1 || !int.TryParse(nextPart, out _)))
{
movieName += part + ".";
previousAcronym = true;
}
else if (part.ToLower() == "a" && (previousAcronym || nextPart.Length == 1))
{
movieName += part + ".";
previousAcronym = true;
}
else if (part.ToLower() == "dr")
{
movieName += part + ".";
previousAcronym = true;
}
else
{
if (previousAcronym)
{
movieName += " ";
previousAcronym = false;
}
movieName += part + " ";
}
n++;
}
movieName = movieName.Trim(' ');
int.TryParse(matchCollection[0].Groups["year"].Value, out var airYear);
var result = new ParsedMovieInfo { Year = airYear };
if (matchCollection[0].Groups["edition"].Success)
{
result.Edition = matchCollection[0].Groups["edition"].Value.Replace(".", " ");
}
var movieTitles = new List<string> { movieName };
// Delete parentheses of the form (aka ...).
var unbracketedName = BracketedAlternativeTitleRegex.Replace(movieName, "$1 AKA $2");
// Split by AKA and filter out empty and duplicate names.
var alternativeTitles = AlternativeTitleRegex
.Split(unbracketedName)
.Where(alternativeName => !string.IsNullOrWhiteSpace(alternativeName) && alternativeName != movieName);
movieTitles.AddRange(alternativeTitles);
result.MovieTitles = movieTitles;
return result;
}
private static bool ValidateBeforeParsing(string title)
{
if (title.ToLower().Contains("password") && title.ToLower().Contains("yenc"))
{
return false;
}
if (!title.Any(char.IsLetterOrDigit))
{
return false;
}
var titleWithoutExtension = ParserCommon.RemoveFileExtension(title);
if (RejectHashedReleasesRegex.Any(v => v.IsMatch(titleWithoutExtension)))
{
return false;
}
return true;
}
private static string? GetReleaseHash(MatchCollection matchCollection)
{
var hash = matchCollection[0].Groups["hash"];
if (hash.Success)
{
var hashValue = hash.Value.Trim('[', ']');
if (hashValue.Equals("1280x720"))
{
return null;
}
return hashValue;
}
return null;
}
}