From 4f2a6e4205b88f9d6d9c8901edd155c9df051ab9 Mon Sep 17 00:00:00 2001 From: Pablo Brasero Date: Thu, 20 Nov 2025 11:55:27 +0000 Subject: [PATCH] Move into submodule ahead of new type of scorer --- lib/spam_scorer.rb | 36 +++------------------------------ lib/spam_scorer/rich_text.rb | 39 ++++++++++++++++++++++++++++++++++++ test/lib/spam_scorer_test.rb | 6 +++--- 3 files changed, 45 insertions(+), 36 deletions(-) create mode 100644 lib/spam_scorer/rich_text.rb diff --git a/lib/spam_scorer.rb b/lib/spam_scorer.rb index 61a62ca0f..546a7d824 100644 --- a/lib/spam_scorer.rb +++ b/lib/spam_scorer.rb @@ -1,41 +1,11 @@ # frozen_string_literal: true -class SpamScorer +module SpamScorer SPAMMY_PHRASES = [ "Business Description:", "Additional Keywords:" ].freeze - def initialize(text) - @text = text + def self.new_from_rich_text(text) + self::RichText.new(text) end - - def score - link_count = 0 - link_size = 0 - - doc = Nokogiri::HTML(text.to_html) - - if doc.content.empty? - link_proportion = 0 - else - doc.xpath("//a").each do |link| - link_count += 1 - link_size += link.content.length - end - - link_proportion = link_size.to_f / doc.content.length - end - - spammy_phrases = SPAMMY_PHRASES.count do |phrase| - doc.content.include?(phrase) - end - - ([link_proportion - 0.2, 0.0].max * 200) + - (link_count * 40) + - (spammy_phrases * 40) - end - - private - - attr_reader :text end diff --git a/lib/spam_scorer/rich_text.rb b/lib/spam_scorer/rich_text.rb new file mode 100644 index 000000000..5f53fb3df --- /dev/null +++ b/lib/spam_scorer/rich_text.rb @@ -0,0 +1,39 @@ +# frozen_string_literal: true + +module SpamScorer + class RichText + def initialize(text) + @text = text + end + + def score + link_count = 0 + link_size = 0 + + doc = Nokogiri::HTML(text.to_html) + + if doc.content.empty? + link_proportion = 0 + else + doc.xpath("//a").each do |link| + link_count += 1 + link_size += link.content.length + end + + link_proportion = link_size.to_f / doc.content.length + end + + spammy_phrases = SPAMMY_PHRASES.count do |phrase| + doc.content.include?(phrase) + end + + ([link_proportion - 0.2, 0.0].max * 200) + + (link_count * 40) + + (spammy_phrases * 40) + end + + private + + attr_reader :text + end +end diff --git a/test/lib/spam_scorer_test.rb b/test/lib/spam_scorer_test.rb index bb1801a53..a8a17a7f6 100644 --- a/test/lib/spam_scorer_test.rb +++ b/test/lib/spam_scorer_test.rb @@ -5,19 +5,19 @@ require "test_helper" class SpamScorerTest < ActiveSupport::TestCase def test_html_spam_score r = RichText.new("html", "foo bar baz") - scorer = SpamScorer.new(r) + scorer = SpamScorer.new_from_rich_text(r) assert_equal 55, scorer.score.round end def test_markdown_spam_score r = RichText.new("markdown", "foo [bar](http://example.com/) baz") - scorer = SpamScorer.new(r) + scorer = SpamScorer.new_from_rich_text(r) assert_equal 50, scorer.score.round end def test_text_spam_score r = RichText.new("text", "foo http://example.com/ bar") - scorer = SpamScorer.new(r) + scorer = SpamScorer.new_from_rich_text(r) assert_equal 141, scorer.score.round end end -- 2.39.5