From d443819a3c2b1667ef51d8b9cf4864425b87ed22 Mon Sep 17 00:00:00 2001
From: Pablo Brasero
Date: Fri, 21 Nov 2025 15:43:45 +0000
Subject: [PATCH] Normalize strings ahead of comparing spammy phrases

---
Review notes (this section is ignored by `git am`):

* Spammy phrases used to be matched with a plain `String#include?`, so a
  change of letter case or a look-alike compatibility character was enough
  to avoid a match. Both the document text and each phrase are now reduced
  to a comparable form before the substring check.
* `to_comparable_form` applies NFKC before case folding as well as after.
  Folding first leaves characters such as U+2102 (DOUBLE-STRUCK CAPITAL C)
  untouched, because they have no case mapping; NFKC then turns them into an
  upper-case ASCII letter that never gets folded. A regression case for this
  was added to the test, raising the expected score from 160 to 200.
* The patch was reflowed to one line per diff line. Indentation of the Ruby
  lines was reconstructed from the hunk line counts and two-space Ruby
  convention -- confirm against the tree before applying.
* The post-image blob ids on the `index` lines are those of the original
  submission and no longer match the amended content.

 lib/spam_scorer/rich_text.rb | 15 ++++++++++++++-
 test/lib/spam_scorer_test.rb | 16 ++++++++++++++--
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/lib/spam_scorer/rich_text.rb b/lib/spam_scorer/rich_text.rb
index c480366f1..69d228fea 100644
--- a/lib/spam_scorer/rich_text.rb
+++ b/lib/spam_scorer/rich_text.rb
@@ -23,8 +23,9 @@ module SpamScorer
         link_proportion = link_size.to_f / doc.content.length
       end
 
+      comparable_content = to_comparable_form(doc.content)
       spammy_phrases = SpammyPhrase.pluck(:phrase).count do |phrase|
-        doc.content.include?(phrase)
+        comparable_content.include?(to_comparable_form(phrase))
       end
 
       ([link_proportion - 0.2, 0.0].max * 200) +
@@ -35,5 +36,17 @@ module SpamScorer
     private
 
     attr_reader :text
+
+    # Reduces +str+ to a canonical form so that strings differing only in
+    # letter case or in Unicode compatibility variants (e.g. U+2169 ROMAN
+    # NUMERAL TEN standing in for "X") compare equal.
+    #
+    # NFKC is applied before case folding as well as after it: compatibility
+    # characters without a case mapping of their own (e.g. U+2102 DOUBLE-STRUCK
+    # CAPITAL C) decompose to cased letters that must still be folded, and
+    # folding can in turn leave the string denormalized.
+    def to_comparable_form(str)
+      str.unicode_normalize(:nfkc).downcase(:fold).unicode_normalize(:nfkc)
+    end
   end
 end
diff --git a/test/lib/spam_scorer_test.rb b/test/lib/spam_scorer_test.rb
index 69b0c694c..03e220c91 100644
--- a/test/lib/spam_scorer_test.rb
+++ b/test/lib/spam_scorer_test.rb
@@ -30,8 +30,20 @@ class SpamScorerTest < ActiveSupport::TestCase
   def test_spammy_phrases
     create(:spammy_phrase, :phrase => "Business Description:")
     create(:spammy_phrase, :phrase => "Additional Keywords:")
-    r = RichText.new("markdown", "Business Description: totally legit beesknees. Additional Keywords: apiary joints")
+    create(:spammy_phrase, :phrase => "Große Preise")
+    create(:spammy_phrase, :phrase => "Relevant Experience:")
+    create(:spammy_phrase, :phrase => "Contact Details:")
+    r = RichText.new(
+      "markdown",
+      <<~SPAM
+        BUSINESS DESCRIPTION: totally legit beesknees.
+        Additional Keywords: apiary joints.
+        GROSSE PREISE: VIEL GELD.
+        RELEVANT EⅩPERIENCE: disguising latin characters as roman numerals.
+        ℂontact Details: disguising latin characters as letterlike symbols.
+      SPAM
+    )
     scorer = SpamScorer.new_from_rich_text(r)
-    assert_equal 80, scorer.score.round
+    assert_equal 200, scorer.score.round
   end
 end
-- 
2.39.5