Change fetch link card service to parse as HTML5 (#31814)
This commit is contained in:
		
							parent
							
								
									9d9901cc5b
								
							
						
					
					
						commit
						a0ea2fa3b0
					
				| @ -157,11 +157,11 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def title |   def title | ||||||
|     html_entities_decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip |     html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def description |   def description | ||||||
|     html_entities_decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description')) |     html_entities.decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description')) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def published_at |   def published_at | ||||||
| @ -181,7 +181,7 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def provider_name |   def provider_name | ||||||
|     html_entities_decode(structured_data&.publisher_name || opengraph_tag('og:site_name')) |     html_entities.decode(structured_data&.publisher_name || opengraph_tag('og:site_name')) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def provider_url |   def provider_url | ||||||
| @ -189,7 +189,7 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def author_name |   def author_name | ||||||
|     html_entities_decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')) |     html_entities.decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username')) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def author_url |   def author_url | ||||||
| @ -258,7 +258,7 @@ class LinkDetailsExtractor | |||||||
| 
 | 
 | ||||||
|       next if json_ld.blank? |       next if json_ld.blank? | ||||||
| 
 | 
 | ||||||
|       structured_data = StructuredData.new(html_entities_decode(json_ld)) |       structured_data = StructuredData.new(html_entities.decode(json_ld)) | ||||||
| 
 | 
 | ||||||
|       next unless structured_data.valid? |       next unless structured_data.valid? | ||||||
| 
 | 
 | ||||||
| @ -274,11 +274,20 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def detect_encoding_and_parse_document |   def detect_encoding_and_parse_document | ||||||
|     [detect_encoding, nil, header_encoding].uniq.each do |encoding| |     html = nil | ||||||
|       document = Nokogiri::HTML(@html, nil, encoding) |     encoding = nil | ||||||
|       return document if document.to_s.valid_encoding? | 
 | ||||||
|  |     [detect_encoding, header_encoding].compact.each do |enc| | ||||||
|  |       html = @html.dup.force_encoding(enc) | ||||||
|  |       if html.valid_encoding? | ||||||
|  |         encoding = enc | ||||||
|  |         break | ||||||
|       end |       end | ||||||
|     Nokogiri::HTML(@html, nil, 'UTF-8') |     end | ||||||
|  | 
 | ||||||
|  |     html = @html unless encoding | ||||||
|  | 
 | ||||||
|  |     Nokogiri::HTML5(html, nil, encoding) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def detect_encoding |   def detect_encoding | ||||||
| @ -299,15 +308,6 @@ class LinkDetailsExtractor | |||||||
|     end |     end | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def html_entities_decode(string) |  | ||||||
|     return if string.nil? |  | ||||||
| 
 |  | ||||||
|     unicode_string = string.to_s.encode('UTF-8') |  | ||||||
|     raise EncodingError, 'cannot convert string to valid UTF-8' unless unicode_string.valid_encoding? |  | ||||||
| 
 |  | ||||||
|     html_entities.decode(unicode_string) |  | ||||||
|   end |  | ||||||
| 
 |  | ||||||
|   def html_entities |   def html_entities | ||||||
|     @html_entities ||= HTMLEntities.new(:expanded) |     @html_entities ||= HTMLEntities.new(:expanded) | ||||||
|   end |   end | ||||||
|  | |||||||
| @ -29,7 +29,7 @@ class FetchLinkCardService < BaseService | |||||||
|     end |     end | ||||||
| 
 | 
 | ||||||
|     attach_card if @card&.persisted? |     attach_card if @card&.persisted? | ||||||
|   rescue HTTP::Error, OpenSSL::SSL::SSLError, Addressable::URI::InvalidURIError, Mastodon::HostValidationError, Mastodon::LengthValidationError, EncodingError, ActiveRecord::RecordInvalid => e |   rescue HTTP::Error, OpenSSL::SSL::SSLError, Addressable::URI::InvalidURIError, Mastodon::HostValidationError, Mastodon::LengthValidationError, Encoding::UndefinedConversionError, ActiveRecord::RecordInvalid => e | ||||||
|     Rails.logger.debug { "Error fetching link #{@original_url}: #{e}" } |     Rails.logger.debug { "Error fetching link #{@original_url}: #{e}" } | ||||||
|     nil |     nil | ||||||
|   end |   end | ||||||
| @ -80,7 +80,7 @@ class FetchLinkCardService < BaseService | |||||||
|     urls = if @status.local? |     urls = if @status.local? | ||||||
|              @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize } |              @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[1]).normalize } | ||||||
|            else |            else | ||||||
|              document = Nokogiri::HTML(@status.text) |              document = Nokogiri::HTML5(@status.text) | ||||||
|              links = document.css('a') |              links = document.css('a') | ||||||
| 
 | 
 | ||||||
|              links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize) |              links.filter_map { |a| Addressable::URI.parse(a['href']) unless skip_link?(a) }.filter_map(&:normalize) | ||||||
|  | |||||||
| @ -192,8 +192,8 @@ RSpec.describe FetchLinkCardService do | |||||||
|         context 'when encoding problems appear in title tag' do |         context 'when encoding problems appear in title tag' do | ||||||
|           let(:status) { Fabricate(:status, text: 'Check out http://example.com/latin1_posing_as_utf8_broken') } |           let(:status) { Fabricate(:status, text: 'Check out http://example.com/latin1_posing_as_utf8_broken') } | ||||||
| 
 | 
 | ||||||
|           it 'does not create a preview card' do |           it 'creates a preview card anyway that replaces invalid bytes with U+FFFD (replacement char)' do | ||||||
|             expect(status.preview_card).to be_nil |             expect(status.preview_card.title).to eq("Tofu <20> l'orange") | ||||||
|           end |           end | ||||||
|         end |         end | ||||||
|       end |       end | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user