From 80085e11aa8589b069310bcc1cbdd3eda7afe512 Mon Sep 17 00:00:00 2001
From: Grant Slater
Date: Tue, 30 Dec 2025 17:58:26 +0000
Subject: [PATCH] SpamScorer: normalise whitespace in comparable form

Collapse Unicode whitespace runs to a single space in
SpamScorer::RichText#to_comparable_form to improve SpammyPhrase
matching across newlines/tabs.

Match with [[:space:]] rather than \s: Ruby's \s is ASCII-only, so
separators that NFKC does not fold to U+0020 (U+0085, U+1680, U+2028,
U+2029) would otherwise survive.
---
Note: the blob ids on the "index" lines and the commit id on the "From"
line predate the [[:space:]] revision of this patch.

 lib/spam_scorer/rich_text.rb |  5 ++++-
 test/lib/spam_scorer_test.rb | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/lib/spam_scorer/rich_text.rb b/lib/spam_scorer/rich_text.rb
index 69d228fea..2c4fee47e 100644
--- a/lib/spam_scorer/rich_text.rb
+++ b/lib/spam_scorer/rich_text.rb
@@ -38,7 +38,10 @@ module SpamScorer
     attr_reader :text
 
     def to_comparable_form(str)
-      str.downcase(:fold).unicode_normalize(:nfkc)
+      # Ruby's \s only matches ASCII whitespace, so use the Unicode-aware
+      # [[:space:]] class to also collapse separators that NFKC leaves
+      # untouched (e.g. U+1680, U+2028, U+2029).
+      str.downcase(:fold).unicode_normalize(:nfkc).gsub(/[[:space:]]+/, " ")
     end
   end
 end
diff --git a/test/lib/spam_scorer_test.rb b/test/lib/spam_scorer_test.rb
index 03e220c91..1759c9197 100644
--- a/test/lib/spam_scorer_test.rb
+++ b/test/lib/spam_scorer_test.rb
@@ -44,4 +44,19 @@ class SpamScorerTest < ActiveSupport::TestCase
     scorer = SpamScorer.new_from_rich_text(r)
     assert_equal 160, scorer.score.round
   end
+
+  def test_to_comparable_form_collapses_unicode_whitespace
+    r = RichText.new("text", "x")
+    scorer = SpamScorer.new_from_rich_text(r)
+
+    input = " A\u00A0\tB\n\nC\u2028\u1680D"
+    assert_equal " a b c d", scorer.send(:to_comparable_form, input)
+  end
+
+  def test_spammy_phrase_can_match_across_newlines_after_normalization
+    create(:spammy_phrase, :phrase => "foo bar")
+    r = RichText.new("markdown", "foo\nbar")
+    scorer = SpamScorer.new_from_rich_text(r)
+    assert_equal 40, scorer.score.round
+  end
 end
-- 
2.39.5