diff --git a/src/Albatross.Expression.Test/Operations/TextOperationTests.cs b/src/Albatross.Expression.Test/Operations/TextOperationTests.cs index f8b63ab..b9d8b93 100644 --- a/src/Albatross.Expression.Test/Operations/TextOperationTests.cs +++ b/src/Albatross.Expression.Test/Operations/TextOperationTests.cs @@ -85,11 +85,22 @@ public class TextOperationTests [TestCase("contains(\"Option one\", \"Option one\")", ExpectedResult = true)] [TestCase("contains(\"Option one\", \"Option two\")", ExpectedResult = false)] + // Word Count markdown text + [TestCase("wordCount(\"**123**\")", ExpectedResult = 1)] + [TestCase("wordCount(\"_123_\")", ExpectedResult = 1)] + [TestCase("wordCount(\"**__123 123__**\")", ExpectedResult = 2)] + [TestCase("wordCount(\"### H3\\n\\nH2\\n--\\n\\nH1\\n==\")", ExpectedResult = 3)] + [TestCase("wordCount(\"**Bold **_Italic_ ~~StrikeThrough~~ **_~~BoldItalicStrikeThrough~~_****_ BoldItalic_** _~~ItalicStrikeThrough~~__ _**~~BoldStrikeThrough~~**\")", ExpectedResult = 7)] + [TestCase("wordCount(\"> Block Quote\\n\\n`Code`\\n\\n Code Block\\n\\nEmoji \\n\\n[Link](https://stackedit.io/app#)\")", ExpectedResult = 7)] + [TestCase("wordCount(\"1. LevelOne\\n\\n2. LevelTwo\\n\\n\\n\\n* LevelOne\\n\\n* LevelTwo\")", ExpectedResult = 4)] + [TestCase("wordCount(\"1. **Bold**\\n\\n2. **_BoldItalic_**\\n\\n3. **_~~BoldItalicStrikeThrough~~_**\\n\\n4. [Link](http://asdasdasdad@dasfv/asdqw)\\n\\n5. Emoji 2 \ud83d\ude07 \ud83d\ude17\")", ExpectedResult = 6)] + // Not Contains [TestCase("NotContains(\"Option One\", \"Option two\")", ExpectedResult = true)] [TestCase("NotContains(\"Option One\", \"Option One\")", ExpectedResult = false)] [TestCase("NotContains(\"[Option One, Option two]\", \"Option One\")", ExpectedResult = false)] - [TestCase("NotContains(\"[Option One, Option two]\", \"Option three\")", ExpectedResult = true)] + [TestCase("NotContains(\"[Option One, Option two]\", \"Option three\")", ExpectedResult = true)] + [TestCase("wordCount(\"1. **Bold**\\n\\n2. **_BoldItalic_**\\n\\n3. **_~~BoldItalicStrikeThrough~~_**\\n\\n4. [Link](http://asdasdasdad@dasfv/asdqw)\\n\\n5. Emoji 2 \ud83d\ude07 \ud83d\ude17\")", ExpectedResult = 6)] // In [TestCase("In(\"Option one\", \"[Option one, Option two]\")", ExpectedResult = true)] @@ -106,6 +117,25 @@ public class TextOperationTests [TestCase("NotIn(\"[Option one, Option two]\", \"[Option one, Option two, Option three]\")", ExpectedResult = false)] [TestCase("NotIn(\"Option two\", \"Option two\")", ExpectedResult = false)] [TestCase("NotIn(\"Option three\", \"Option two\")", ExpectedResult = true)] + // Char Count + [TestCase("charCount(\"123\")", ExpectedResult = 3)] + [TestCase("charCount(\"123 123\")", ExpectedResult = 6)] + [TestCase("charCount(\"Word\")", ExpectedResult = 4)] + [TestCase("charCount(\"C Sharp\")", ExpectedResult = 6)] + + // Char Count markdown text + [TestCase("charCount(\"**123**\")", ExpectedResult = 3)] + [TestCase("charCount(\"_123_\")", ExpectedResult = 3)] + [TestCase("charCount(\"**__123 123__**\")", ExpectedResult = 6)] + [TestCase("charCount(\"### H3\\n\\nH2\\n--\\n\\nH1\\n==\")", ExpectedResult = 6)] + [TestCase("charCount(\"**Bold **_Italic_ ~~StrikeThrough~~ **_~~BoldItalicStrikeThrough~~_****_ BoldItalic_** _~~ItalicStrikeThrough~~__ _**~~BoldStrikeThrough~~**\")", ExpectedResult = 96)] + [TestCase("charCount(\"> Block Quote\\n\\n`Code`\\n\\n Code Block\\n\\nEmoji \\n\\n[Link](https://stackedit.io/app#)\")", ExpectedResult = 32)] + [TestCase("charCount(\"1. LevelOne\\n\\n2. LevelTwo\\n\\n\\n\\n* LevelOne\\n\\n* LevelTwo\")", ExpectedResult = 32)] + [TestCase("charCount(\"1. **Bold**\\n\\n2. **_BoldItalic_**\\n\\n3. **_~~BoldItalicStrikeThrough~~_**\\n\\n4. [Link](http://asdasdasdad@dasfv/asdqw)\\n\\n5. Emoji 2 \ud83d\ude07 \ud83d\ude17\")", ExpectedResult = 47)] + [TestCase("charCount(\"[IRFAN](https://testqa.workiom.club/app/apps/b226f5a6-7a1a-41be-ac89-546bd3084bf5/list/1f815325-d905-49e5-8976-c26dafff4f89/394417) IRFAN\")", ExpectedResult = 10)] + [TestCase("charCount(\"IRFAN 💫\")", ExpectedResult = 5)] + [TestCase("charCount(\"IRFAN\\n=====\")", ExpectedResult = 5)] + [TestCase("charCount(\"> IRFAN\")", ExpectedResult = 5)] public object OperationsTesting(string expression) { return Factory.Instance.Create().Compile(expression).EvalValue(null); diff --git a/src/Albatross.Expression/Extension.cs b/src/Albatross.Expression/Extension.cs index 05bfbf3..91d1218 100644 --- a/src/Albatross.Expression/Extension.cs +++ b/src/Albatross.Expression/Extension.cs @@ -125,53 +125,44 @@ public static IToken Compile(this IParser parser, string expression) #endregion #region String - + + private static readonly Regex MarkdownRegex = new Regex(@"^(#|##|###|####|#####|######|\*{1,3}|_{1,3}|\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)|\d+\.\s|```[\s\S]+?```|\|.*\|.*\|)|(!\[[^\]]*\]\([^\)]*\)|\[[^\]]*\]\([^\)]*\)|\*\*.*\*\*|__.*__|\*.*\*|_.*_|`[^`]*`|\[.*\]\(.*\)|<.*>)", RegexOptions.Multiline | RegexOptions.Compiled); + private static readonly Regex CodeTagRegex = new Regex(@"(.*?)<\/code>", RegexOptions.Compiled); + private static readonly Regex HeaderEqualsRegex = new Regex(@"\n[=\-]+", RegexOptions.Compiled); + private static readonly Regex BoldRegex = new Regex(@"\*\*(.+?)\*\*", RegexOptions.Multiline | RegexOptions.Compiled); + private static readonly Regex ItalicRegex = new Regex(@"(?)$"; - - Regex regex = new Regex(markdownPattern, RegexOptions.Multiline); + text = text.Replace("\\n", "\n"); - if (!regex.IsMatch(text)) - { - normalizedText = null; + if (!IsMarkdown(text)) return false; - } + + text = text.StripEmojis(); - // Convert it to Html and then to plain text - text = ConvertHtmlToPlainText(CommonMark.CommonMarkConverter.Convert(text)); + var html = ConvertMarkdownToHtml(text); + var plainText = ConvertHtmlToPlainText(html); + plainText = StripMarkdown(plainText); + normalizedText = plainText.Trim(); - // Remove all markdown tags - text = Regex.Replace(text, "\n=+", ""); // Headers - text = Regex.Replace(text, @"\*\*(.*?)\*\*", "$1", RegexOptions.Multiline); // Remove bold tags - text = Regex.Replace(text, @"\*(.*?)\*", "$1", RegexOptions.Multiline); // Remove italic tags - text = Regex.Replace(text, @"~~(.*?)~~", "$1", RegexOptions.Multiline); // Remove strikethrough tags - text = Regex.Replace(text, @"`(.*?)`", "$1", RegexOptions.Multiline); // Remove inline code tags - text = Regex.Replace(text, @"\[(.*?)\]\(.*?\)", "$1", RegexOptions.Multiline); // Remove link tags - text = Regex.Replace(text, @"!\[(.*?)\]\(.*?\)", "$1", RegexOptions.Multiline); // Remove image tags - text = Regex.Replace(text, @"^#+\s+", "", RegexOptions.Multiline); // Remove heading tags - text = Regex.Replace(text, @"\n[*-]\s+", ""); // Remove list item tags - text = Regex.Replace(text, @"\n\d+\.\s+", ""); // Remove ordered list item tags - text = Regex.Replace(text, @"^-{3,}", "", RegexOptions.Multiline); // Remove horizontal rule tags - text = Regex.Replace(text, @"`{3}[\s\S]*?`{3}", "", RegexOptions.Multiline); // Remove code block tags - text = Regex.Replace(text, @"\|.*?\|(\n\|.*?\|)*", ""); // Remove table tags - - // Replace escape sequences with white space - text = Regex.Replace(text, @"\\[abfnrt\'\""\0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}", - " "); - - // Add regex to remove escape sequences - text = Regex.Replace(text, @"\\[nt\""']+", ""); - - normalizedText = text.Trim(); // Trim leading and trailing spaces - - return true; + return normalizedText != null; } - + #endregion #region Utilities @@ -183,16 +174,65 @@ public static object GetOperandsAt(this List operands, int index) return null; } + + private static bool IsMarkdown(this string text) + { + return MarkdownRegex.IsMatch(text); + } - private static string ConvertHtmlToPlainText(string html) + private static string ConvertHtmlToPlainText(this string html) { var doc = new HtmlDocument(); doc.LoadHtml(html); - // Use HtmlAgilityPack to extract text from HTML return doc.DocumentNode.InnerText; } + + private static string ConvertMarkdownToHtml(this string markdown) + { + var html = CommonMark.CommonMarkConverter.Convert(markdown); + return CodeTagRegex.Replace(html, " $1 "); + } + + private static string StripMarkdown(this string markdown) + { + var text = markdown; + text = CodeBlockRegex.Replace(text, ""); + text = InlineCodeRegex.Replace(text, "$1"); + + text = ImageRegex.Replace(text, m => { + string altText = m.Groups[1].Value.Trim(); + return string.IsNullOrWhiteSpace(altText) ? "" : altText; + }); + + text = LinkRegex.Replace(text, m => { + string linkText = m.Groups[1].Value.Trim(); + return string.IsNullOrWhiteSpace(linkText) ? "" : linkText; + }); + + text = StrikethroughRegex.Replace(text, "$1"); + text = BoldRegex.Replace(text, "$1"); + text = ItalicRegex.Replace(text, "$1"); + text = HeaderEqualsRegex.Replace(text, ""); + text = HeadingRegex.Replace(text, ""); + text = HorizontalRuleRegex.Replace(text, ""); + text = ListItemRegex.Replace(text, ""); + text = OrderedListRegex.Replace(text, ""); + text = TableRegex.Replace(text, ""); + text = EscapeSequenceRegex.Replace(text, " "); + text = EscapeCharsRegex.Replace(text, ""); + + return text; + } + + private static string StripEmojis(this string text) + { + var emojiPattern = "\\uD83D(?:\\uDC68(?:\\uD83C(?:[\\uDFFB-\\uDFFF]\\u200D(?:\\u2764\\uFE0F?\\u200D\\uD83D(?:\\uDC8B\\u200D\\uD83D)?\\uDC68\\uD83C[\\uDFFB-\\uDFFF]|\\uD83E(?:\\uDD1D\\u200D\\uD83D\\uDC68\\uD83C[\\uDFFB-\\uDFFF]|[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|[\\uDFFB-\\uDFFF])|\\u200D(?:\\u2764\\uFE0F?\\u200D\\uD83D(?:\\uDC8B\\u200D\\uD83D)?\\uDC68|\\uD83D(?:(?:[\\uDC68\\uDC69]\\u200D\\uD83D)?(?:\\uDC66(?:\\u200D\\uD83D\\uDC66)?|\\uDC67(?:\\u200D\\uD83D[\\uDC66\\uDC67])?)|[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83E[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD]))?|\\uDC69(?:\\uD83C(?:[\\uDFFB-\\uDFFF]\\u200D(?:\\u2764\\uFE0F?\\u200D\\uD83D(?:\\uDC8B\\u200D\\uD83D[\\uDC68\\uDC69]|[\\uDC68\\uDC69])\\uD83C[\\uDFFB-\\uDFFF]|\\uD83E(?:\\uDD1D\\u200D\\uD83D[\\uDC68\\uDC69]\\uD83C[\\uDFFB-\\uDFFF]|[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|[\\uDFFB-\\uDFFF])|\\u200D(?:\\u2764\\uFE0F?\\u200D\\uD83D(?:\\uDC8B\\u200D\\uD83D[\\uDC68\\uDC69]|[\\uDC68\\uDC69])|\\uD83D(?:(?:\\uDC69\\u200D\\uD83D)?(?:\\uDC66(?:\\u200D\\uD83D\\uDC66)?|\\uDC67(?:\\u200D\\uD83D[\\uDC66\\uDC67])?)|[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83E[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD]))?|(?:\\uDD75(?:\\uD83C[\\uDFFB-\\uDFFF]|\\uFE0F)?|\\uDC6F)(?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|[\\uDC6E\\uDC70\\uDC71\\uDC73\\uDC77\\uDC81\\uDC82\\uDC86\\uDC87\\uDE45-\\uDE47\\uDE4B\\uDE4D\\uDE4E\\uDEA3\\uDEB4-\\uDEB6](?:\\uD83C[\\uDFFB-\\uDFFF](?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\uDC41(?:\\uFE0F(?:\\u200D\\uD83D\\uDDE8\\uFE0F?)?|\\u200D\\uD83D\\uDDE8\\uFE0F?)?|\\uDE36(?:\\u200D\\uD83C\\uDF2B\\uFE0F?)?|\\uDC15(?:\\u200D\\uD83E\\uDDBA)?|\\uDC3B(?:\\u200D\\u2744\\uFE0F?)?|\\uDE2E(?:\\u200D\\uD83D\\uDCA8)?|\\uDE35(?:\\u200D\\uD83D\\uDCAB)?|[\\uDC42\\uDC43\\uDC46-\\uDC50\\uDC66\\uDC67\\uDC6B-\\uDC6D\\uDC72\\uDC74-\\uDC76\\uDC78\\uDC7C\\uDC83\\uDC85\\uDC8F\\uDC91\\uDCAA\\uDD7A\\uDD95\\uDD96\\uDE4C\\uDE4F\\uDEC0\\uDECC](?:\\uD83C[\\uDFFB-\\uDFFF])?|[\\uDD74\\uDD90]\\uD83C[\\uDFFB-\\uDFFF]|\\uDC08(?:\\u200D\\u2B1B)?|[\\uDC3F\\uDCFD\\uDD49\\uDD4A\\uDD6F\\uDD70\\uDD73\\uDD74\\uDD76-\\uDD79\\uDD87\\uDD8A-\\uDD8D\\uDD90\\uDDA5\\uDDA8\\uDDB1\\uDDB2\\uDDBC\\uDDC2-\\uDDC4\\uDDD1-\\uDDD3\\uDDDC-\\uDDDE\\uDDE1\\uDDE3\\uDDE8\\uDDEF\\uDDF3\\uDDFA\\uDECB\\uDECD-\\uDECF\\uDEE0-\\uDEE5\\uDEE9\\uDEF0\\uDEF3]\\uFE0F?|[\\uDC00-\\uDC07\\uDC09-\\uDC14\\uDC16-\\uDC3A\\uDC3C-\\uDC3E\\uDC40\\uDC44\\uDC45\\uDC51-\\uDC65\\uDC6A\\uDC79-\\uDC7B\\uDC7D-\\uDC80\\uDC84\\uDC88-\\uDC8E\\uDC90\\uDC92-\\uDCA9\\uDCAB-\\uDCFC\\uDCFF-\\uDD3D\\uDD4B-\\uDD4E\\uDD50-\\uDD67\\uDDA4\\uDDFB-\\uDE2D\\uDE2F-\\uDE34\\uDE37-\\uDE44\\uDE48-\\uDE4A\\uDE80-\\uDEA2\\uDEA4-\\uDEB3\\uDEB7-\\uDEBF\\uDEC1-\\uDEC5\\uDED0-\\uDED2\\uDED5-\\uDED7\\uDEEB\\uDEEC\\uDEF4-\\uDEFC\\uDFE0-\\uDFEB])|\\uD83E(?:\\uDDD1(?:\\uD83C(?:[\\uDFFB-\\uDFFF]\\u200D(?:\\u2764\\uFE0F?\\u200D(?:\\uD83D\\uDC8B\\u200D)?\\uD83E\\uDDD1\\uD83C[\\uDFFB-\\uDFFF]|\\uD83E(?:\\uDD1D\\u200D\\uD83E\\uDDD1\\uD83C[\\uDFFB-\\uDFFF]|[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF84\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92])|[\\uDFFB-\\uDFFF])|\\u200D(?:\\uD83E(?:\\uDD1D\\u200D\\uD83E\\uDDD1|[\\uDDAF-\\uDDB3\\uDDBC\\uDDBD])|[\\u2695\\u2696\\u2708]\\uFE0F?|\\uD83C[\\uDF3E\\uDF73\\uDF7C\\uDF84\\uDF93\\uDFA4\\uDFA8\\uDFEB\\uDFED]|\\uD83D[\\uDCBB\\uDCBC\\uDD27\\uDD2C\\uDE80\\uDE92]))?|[\\uDD26\\uDD35\\uDD37-\\uDD39\\uDD3D\\uDD3E\\uDDB8\\uDDB9\\uDDCD-\\uDDCF\\uDDD4\\uDDD6-\\uDDDD](?:\\uD83C[\\uDFFB-\\uDFFF](?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\u200D[\\u2640\\u2642]\\uFE0F?)?|[\\uDD3C\\uDDDE\\uDDDF](?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|[\\uDD0C\\uDD0F\\uDD18-\\uDD1C\\uDD1E\\uDD1F\\uDD30-\\uDD34\\uDD36\\uDD77\\uDDB5\\uDDB6\\uDDBB\\uDDD2\\uDDD3\\uDDD5](?:\\uD83C[\\uDFFB-\\uDFFF])?|[\\uDD0D\\uDD0E\\uDD10-\\uDD17\\uDD1D\\uDD20-\\uDD25\\uDD27-\\uDD2F\\uDD3A\\uDD3F-\\uDD45\\uDD47-\\uDD76\\uDD78\\uDD7A-\\uDDB4\\uDDB7\\uDDBA\\uDDBC-\\uDDCB\\uDDD0\\uDDE0-\\uDDFF\\uDE70-\\uDE74\\uDE78-\\uDE7A\\uDE80-\\uDE86\\uDE90-\\uDEA8\\uDEB0-\\uDEB6\\uDEC0-\\uDEC2\\uDED0-\\uDED6])|\\uD83C(?:\\uDFF4(?:\\uDB40\\uDC67\\uDB40\\uDC62\\uDB40(?:\\uDC65\\uDB40\\uDC6E\\uDB40\\uDC67|\\uDC73\\uDB40\\uDC63\\uDB40\\uDC74|\\uDC77\\uDB40\\uDC6C\\uDB40\\uDC73)\\uDB40\\uDC7F|\\u200D\\u2620\\uFE0F?)?|[\\uDFC3\\uDFC4\\uDFCA](?:\\uD83C[\\uDFFB-\\uDFFF](?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\u200D[\\u2640\\u2642]\\uFE0F?)?|[\\uDFCB\\uDFCC](?:\\uD83C[\\uDFFB-\\uDFFF]|\\uFE0F)(?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\uDFF3(?:\\uFE0F(?:\\u200D(?:\\u26A7\\uFE0F?|\\uD83C\\uDF08))?|\\u200D(?:\\u26A7\\uFE0F?|\\uD83C\\uDF08))?|(?:[\\uDFCB\\uDFCC]\\u200D[\\u2640\\u2642]|[\\uDD70\\uDD71\\uDD7E\\uDD7F\\uDE02\\uDE37\\uDF21\\uDF24-\\uDF2C\\uDF36\\uDF7D\\uDF96\\uDF97\\uDF99-\\uDF9B\\uDF9E\\uDF9F\\uDFCD\\uDFCE\\uDFD4-\\uDFDF\\uDFF5\\uDFF7])\\uFE0F?|[\\uDF85\\uDFC2\\uDFC7](?:\\uD83C[\\uDFFB-\\uDFFF])?|\\uDDE6\\uD83C[\\uDDE8-\\uDDEC\\uDDEE\\uDDF1\\uDDF2\\uDDF4\\uDDF6-\\uDDFA\\uDDFC\\uDDFD\\uDDFF]|\\uDDE7\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEF\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9\\uDDFB\\uDDFC\\uDDFE\\uDDFF]|\\uDDE8\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDEE\\uDDF0-\\uDDF5\\uDDF7\\uDDFA-\\uDDFF]|\\uDDE9\\uD83C[\\uDDEA\\uDDEC\\uDDEF\\uDDF0\\uDDF2\\uDDF4\\uDDFF]|\\uDDEA\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDED\\uDDF7-\\uDDFA]|\\uDDEB\\uD83C[\\uDDEE-\\uDDF0\\uDDF2\\uDDF4\\uDDF7]|\\uDDEC\\uD83C[\\uDDE6\\uDDE7\\uDDE9-\\uDDEE\\uDDF1-\\uDDF3\\uDDF5-\\uDDFA\\uDDFC\\uDDFE]|\\uDDED\\uD83C[\\uDDF0\\uDDF2\\uDDF3\\uDDF7\\uDDF9\\uDDFA]|\\uDDEE\\uD83C[\\uDDE8-\\uDDEA\\uDDF1-\\uDDF4\\uDDF6-\\uDDF9]|\\uDDEF\\uD83C[\\uDDEA\\uDDF2\\uDDF4\\uDDF5]|\\uDDF0\\uD83C[\\uDDEA\\uDDEC-\\uDDEE\\uDDF2\\uDDF3\\uDDF5\\uDDF7\\uDDFC\\uDDFE\\uDDFF]|\\uDDF1\\uD83C[\\uDDE6-\\uDDE8\\uDDEE\\uDDF0\\uDDF7-\\uDDFB\\uDDFE]|\\uDDF2\\uD83C[\\uDDE6\\uDDE8-\\uDDED\\uDDF0-\\uDDFF]|\\uDDF3\\uD83C[\\uDDE6\\uDDE8\\uDDEA-\\uDDEC\\uDDEE\\uDDF1\\uDDF4\\uDDF5\\uDDF7\\uDDFA\\uDDFF]|\\uDDF4\\uD83C\\uDDF2|\\uDDF5\\uD83C[\\uDDE6\\uDDEA-\\uDDED\\uDDF0-\\uDDF3\\uDDF7-\\uDDF9\\uDDFC\\uDDFE]|\\uDDF6\\uD83C\\uDDE6|\\uDDF7\\uD83C[\\uDDEA\\uDDF4\\uDDF8\\uDDFA\\uDDFC]|\\uDDF8\\uD83C[\\uDDE6-\\uDDEA\\uDDEC-\\uDDF4\\uDDF7-\\uDDF9\\uDDFB\\uDDFD-\\uDDFF]|\\uDDF9\\uD83C[\\uDDE6\\uDDE8\\uDDE9\\uDDEB-\\uDDED\\uDDEF-\\uDDF4\\uDDF7\\uDDF9\\uDDFB\\uDDFC\\uDDFF]|\\uDDFA\\uD83C[\\uDDE6\\uDDEC\\uDDF2\\uDDF3\\uDDF8\\uDDFE\\uDDFF]|\\uDDFB\\uD83C[\\uDDE6\\uDDE8\\uDDEA\\uDDEC\\uDDEE\\uDDF3\\uDDFA]|\\uDDFC\\uD83C[\\uDDEB\\uDDF8]|\\uDDFD\\uD83C\\uDDF0|\\uDDFE\\uD83C[\\uDDEA\\uDDF9]|\\uDDFF\\uD83C[\\uDDE6\\uDDF2\\uDDFC]|[\\uDC04\\uDCCF\\uDD8E\\uDD91-\\uDD9A\\uDE01\\uDE1A\\uDE2F\\uDE32-\\uDE36\\uDE38-\\uDE3A\\uDE50\\uDE51\\uDF00-\\uDF20\\uDF2D-\\uDF35\\uDF37-\\uDF7C\\uDF7E-\\uDF84\\uDF86-\\uDF93\\uDFA0-\\uDFC1\\uDFC5\\uDFC6\\uDFC8\\uDFC9\\uDFCB\\uDFCC\\uDFCF-\\uDFD3\\uDFE0-\\uDFF0\\uDFF8-\\uDFFF])|\\u26F9(?:(?:\\uD83C[\\uDFFB-\\uDFFF]|\\uFE0F)(?:\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\u200D[\\u2640\\u2642]\\uFE0F?)?|\\u2764(?:\\uFE0F(?:\\u200D(?:\\uD83D\\uDD25|\\uD83E\\uDE79))?|\\u200D(?:\\uD83D\\uDD25|\\uD83E\\uDE79))?|[\\#\\*0-9]\\uFE0F?\\u20E3|[\\u261D\\u270C\\u270D]\\uD83C[\\uDFFB-\\uDFFF]|[\\u270A\\u270B](?:\\uD83C[\\uDFFB-\\uDFFF])?|[\\u00A9\\u00AE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9\\u21AA\\u2328\\u23CF\\u23ED-\\u23EF\\u23F1\\u23F2\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB\\u25FC\\u2600-\\u2604\\u260E\\u2611\\u2618\\u261D\\u2620\\u2622\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638-\\u263A\\u2640\\u2642\\u265F\\u2660\\u2663\\u2665\\u2666\\u2668\\u267B\\u267E\\u2692\\u2694-\\u2697\\u2699\\u269B\\u269C\\u26A0\\u26A7\\u26B0\\u26B1\\u26C8\\u26CF\\u26D1\\u26D3\\u26E9\\u26F0\\u26F1\\u26F4\\u26F7\\u26F8\\u2702\\u2708\\u2709\\u270C\\u270D\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2733\\u2734\\u2744\\u2747\\u2763\\u27A1\\u2934\\u2935\\u2B05-\\u2B07\\u3030\\u303D\\u3297\\u3299]\\uFE0F?|[\\u231A\\u231B\\u23E9-\\u23EC\\u23F0\\u23F3\\u25FD\\u25FE\\u2614\\u2615\\u2648-\\u2653\\u267F\\u2693\\u26A1\\u26AA\\u26AB\\u26BD\\u26BE\\u26C4\\u26C5\\u26CE\\u26D4\\u26EA\\u26F2\\u26F3\\u26F5\\u26FA\\u26FD\\u2705\\u2728\\u274C\\u274E\\u2753-\\u2755\\u2757\\u2795-\\u2797\\u27B0\\u27BF\\u2B1B\\u2B1C\\u2B50\\u2B55]\n"; + text = Regex.Replace(text, emojiPattern, " "); + return text; + } + #endregion } } diff --git a/src/Albatross.Expression/Functions/Text/CharCount.cs b/src/Albatross.Expression/Functions/Text/CharCount.cs new file mode 100644 index 0000000..7202815 --- /dev/null +++ b/src/Albatross.Expression/Functions/Text/CharCount.cs @@ -0,0 +1,68 @@ +using Albatross.Expression.Documentation.Attributes; +using Albatross.Expression.Exceptions; +using Albatross.Expression.Tokens; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using Group = Albatross.Expression.Documentation.Group; + +namespace Albatross.Expression.Functions.Text +{ + [FunctionDoc(Group.Text, "{token}( )", + @"### Returns the number of characters in a string. +#### Inputs: +- string: String +#### Outputs: +- Integer" + )] + [ParserOperation] + public class CharCount : PrefixOperationToken + { + public override string Name => "CharCount"; + public override int MinOperandCount => 1; + public override int MaxOperandCount => 1; + public override bool Symbolic => false; + + public override object EvalValue(Func context) + { + List list = GetOperands(context); + + object value = list[0]; + switch (value) + { + case null: + return null; + case string s: + return CountCharacters(s); + case ICollection collection: + return Convert.ToDouble(collection.Count); + default: + throw new UnexpectedTypeException(value.GetType()); + } + } + + private static double CountCharacters(string text) + { + if (string.IsNullOrEmpty(text)) + return 0; + + text = text.Trim(); + + var result = text.TryNormalizeText(out string normalizedText); + if (result) + text = normalizedText; + + // Count ONLY alphanumeric characters (letters A-Z, a-z and numbers 0-9) + // Per requirements: Do NOT count apostrophes, spaces, punctuation, or special symbols + int count = 0; + foreach (var c in text) + { + if (char.IsLetter(c) || char.IsDigit(c)) + count++; + } + + return count; + } + } +} \ No newline at end of file diff --git a/src/Albatross.Expression/Functions/Text/WordCount.cs b/src/Albatross.Expression/Functions/Text/WordCount.cs new file mode 100644 index 0000000..7e47b01 --- /dev/null +++ b/src/Albatross.Expression/Functions/Text/WordCount.cs @@ -0,0 +1,84 @@ +using Albatross.Expression.Documentation.Attributes; +using Albatross.Expression.Exceptions; +using Albatross.Expression.Tokens; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using Group = Albatross.Expression.Documentation.Group; + +namespace Albatross.Expression.Operations +{ + [FunctionDoc(Group.Text, "{token}( )", +@"### Returns the number of words in a string. +#### Inputs: +- string: String +#### Outputs: +- Integer" + )] + [ParserOperation] + public class WordCount : PrefixOperationToken + { + + public override string Name => "WordCount"; + public override int MinOperandCount => 1; + public override int MaxOperandCount => 1; + public override bool Symbolic => false; + + static readonly Regex WordPattern = new Regex(@"\b[\w']+\b", RegexOptions.Compiled); + + public override object EvalValue(Func context) + { + List list = GetOperands(context); + + object value = list[0]; + switch (value) + { + case null: + return null; + case string s: + return CountWords(s); + case ICollection collection: + return Convert.ToDouble(collection.Count); + default: + throw new UnexpectedTypeException(value.GetType()); + } + } + + private static double CountWords(string text) + { + if (string.IsNullOrEmpty(text)) + return 0; + + text = text.Trim(); + + var result = text.TryNormalizeText(out string normalizedText); + if (result) + text = normalizedText; + + // Collapse all whitespace to single spaces + text = Regex.Replace(text, @"\s+", " "); + text = text.Trim(); + + if (string.IsNullOrEmpty(text)) + return 0; + + MatchCollection matches = WordPattern.Matches(text); + + // Filter out matches that are only apostrophes or punctuation + int count = 0; + foreach (Match match in matches) + { + string word = match.Value; + // Only count if the word contains at least one alphanumeric character + if (word.Any(char.IsLetterOrDigit)) + { + count++; + } + } + + return count; + } + } +}