git.openstreetmap.org Git - rails.git/blob - lib/rich_text.rb
No need as that's the default behaviour
[rails.git] / lib / rich_text.rb
1 # frozen_string_literal: true
2
3 module RichText
4   SPAMMY_PHRASES = [
5     "Business Description:", "Additional Keywords:"
6   ].freeze
7
8   DESCRIPTION_MAX_LENGTH = 500
9   DESCRIPTION_WORD_BREAK_THRESHOLD_LENGTH = 450
10
11   def self.new(format, text)
12     case format
13     when "html" then HTML.new(text || "")
14     when "markdown" then Markdown.new(text || "")
15     when "text" then Text.new(text || "")
16     end
17   end
18
19   class SimpleFormat
20     include ActionView::Helpers::TextHelper
21     include ActionView::Helpers::OutputSafetyHelper
22
23     def sanitize(text, _options = {})
24       Sanitize.clean(text, Sanitize::Config::OSM).html_safe
25     end
26   end
27
28   class Base < String
29     def spam_score
30       link_count = 0
31       link_size = 0
32
33       doc = Nokogiri::HTML(to_html)
34
35       if doc.content.empty?
36         link_proportion = 0
37       else
38         doc.xpath("//a").each do |link|
39           link_count += 1
40           link_size += link.content.length
41         end
42
43         link_proportion = link_size.to_f / doc.content.length
44       end
45
46       spammy_phrases = SPAMMY_PHRASES.count do |phrase|
47         doc.content.include?(phrase)
48       end
49
50       ([link_proportion - 0.2, 0.0].max * 200) +
51         (link_count * 40) +
52         (spammy_phrases * 40)
53     end
54
55     def image
56       nil
57     end
58
59     def image_alt
60       nil
61     end
62
63     def description
64       nil
65     end
66
67     def truncate_html(max_length = nil, img_length = 1000)
68       html_doc = to_html
69       return html_doc if max_length.nil?
70
71       doc = Nokogiri::HTML::DocumentFragment.parse(html_doc)
72       keep_or_discards = %w[p h1 h2 h3 h4 h5 h6 pre a table ul ol dl]
73       accumulated_length = 0
74       exceeded_node_parent = nil
75       truncated = false
76
77       doc.traverse do |node|
78         if accumulated_length >= max_length
79           if node == exceeded_node_parent
80             exceeded_node_parent = node.parent
81             node.remove if keep_or_discards.include?(node.name)
82           else
83             node.remove
84           end
85           next
86         end
87
88         next unless node.children.empty?
89
90         if node.text?
91           accumulated_length += node.text.length
92         elsif node.name == "img"
93           accumulated_length += img_length
94         end
95
96         if accumulated_length >= max_length
97           truncated = true
98           exceeded_node_parent = node.parent
99           node.remove
100         end
101       end
102
103       {
104         :truncated => truncated,
105         :html => doc.to_html.html_safe
106       }
107     end
108
109     protected
110
111     def simple_format(text)
112       SimpleFormat.new.simple_format(text, :dir => "auto")
113     end
114
115     def sanitize(text)
116       Sanitize.clean(text, Sanitize::Config::OSM).html_safe
117     end
118
119     def linkify(text, mode = :urls)
120       link_attr = 'rel="nofollow noopener noreferrer" dir="auto"'
121       Rinku.auto_link(ERB::Util.html_escape(text), mode, link_attr) do |url|
122         url = shorten_host(url, Settings.linkify_hosts, Settings.linkify_hosts_replacement)
123         shorten_host(url, Settings.linkify_wiki_hosts, Settings.linkify_wiki_hosts_replacement) do |path|
124           path.sub(Regexp.new(Settings.linkify_wiki_optional_path_prefix || ""), "")
125         end
126       end.html_safe
127     end
128
129     private
130
131     def shorten_host(url, hosts, hosts_replacement)
132       %r{^(https?://([^/]*))(.*)$}.match(url) do |m|
133         scheme_host, host, path = m.captures
134         if hosts&.include?(host)
135           path = yield(path) if block_given?
136           if hosts_replacement
137             "#{hosts_replacement}#{path}"
138           else
139             "#{scheme_host}#{path}"
140           end
141         end || url
142       end || url
143     end
144   end
145
146   class HTML < Base
147     def to_html
148       linkify(simple_format(self))
149     end
150
151     def to_text
152       to_s
153     end
154   end
155
156   class Markdown < Base
157     def to_html
158       linkify(sanitize(document.to_html), :all)
159     end
160
161     def to_text
162       to_s
163     end
164
165     def image
166       @image_element = first_image_element(document.root) unless defined? @image_element
167       @image_element.attr["src"] if @image_element
168     end
169
170     def image_alt
171       @image_element = first_image_element(document.root) unless defined? @image_element
172       @image_element.attr["alt"] if @image_element
173     end
174
175     def description
176       return @description if defined? @description
177
178       @description = first_truncated_text_content(document.root)
179     end
180
181     private
182
183     def document
184       return @document if @document
185
186       @document = Kramdown::Document.new(self)
187
188       should_get_dir_auto = lambda do |el|
189         dir_auto_types = [:p, :header, :codespan, :codeblock, :pre, :ul, :ol, :table, :dl, :math]
190         return true if dir_auto_types.include?(el.type)
191         return true if el.type == :a && el.children.length == 1 && el.children[0].type == :text && el.children[0].value == el.attr["href"]
192
193         false
194       end
195
196       add_dir = lambda do |element|
197         element.attr["dir"] ||= "auto" if should_get_dir_auto.call(element)
198         element.children.each(&add_dir)
199       end
200       add_dir.call(@document.root)
201
202       @document
203     end
204
205     def first_image_element(element)
206       return element if image?(element) && element.attr["src"].present?
207
208       element.children.find do |child|
209         nested_image = first_image_element(child)
210         break nested_image if nested_image
211       end
212     end
213
214     def first_truncated_text_content(element)
215       if paragraph?(element)
216         truncated_text_content(element)
217       else
218         element.children.find do |child|
219           text = first_truncated_text_content(child)
220           break text unless text.nil?
221         end
222       end
223     end
224
225     def truncated_text_content(element)
226       text = +""
227
228       append_text = lambda do |child|
229         if child.type == :text
230           text << child.value
231         else
232           child.children.each do |c|
233             append_text.call(c)
234             break if text.length > DESCRIPTION_MAX_LENGTH
235           end
236         end
237       end
238       append_text.call(element)
239
240       return nil if text.blank?
241
242       text_truncated_to_word_break = text.truncate(DESCRIPTION_MAX_LENGTH, :separator => /(?<!\s)\s+/)
243
244       if text_truncated_to_word_break.length >= DESCRIPTION_WORD_BREAK_THRESHOLD_LENGTH
245         text_truncated_to_word_break
246       else
247         text.truncate(DESCRIPTION_MAX_LENGTH)
248       end
249     end
250
251     def image?(element)
252       element.type == :img || (element.type == :html_element && element.value == "img")
253     end
254
255     def paragraph?(element)
256       element.type == :p || (element.type == :html_element && element.value == "p")
257     end
258   end
259
260   class Text < Base
261     def to_html
262       linkify(simple_format(ERB::Util.html_escape(self)))
263     end
264
265     def to_text
266       to_s
267     end
268   end
269 end