LinkDetailsExtractor adjustments (#31357)
This commit is contained in:
		
							parent
							
								
									68c7782940
								
							
						
					
					
						commit
						0518613dd7
					
				| @ -157,7 +157,7 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def title |   def title | ||||||
|     html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip |     html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def description |   def description | ||||||
| @ -205,11 +205,11 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def language |   def language | ||||||
|     valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang')) |     valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang')) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def icon |   def icon | ||||||
|     valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon')) |     valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon')) | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   private |   private | ||||||
| @ -237,18 +237,20 @@ class LinkDetailsExtractor | |||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def link_tag(name) |   def link_tag(name) | ||||||
|     document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href') |     head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href') | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def opengraph_tag(name) |   def opengraph_tag(name) | ||||||
|     document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content') |     head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content') | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def meta_tag(name) |   def meta_tag(name) | ||||||
|     document.xpath("//meta[@name=\"#{name}\"]").pick('content') |     head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content') | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def structured_data |   def structured_data | ||||||
|  |     return @structured_data if defined?(@structured_data) | ||||||
|  | 
 | ||||||
|     # Some publications have more than one JSON-LD definition on the page, |     # Some publications have more than one JSON-LD definition on the page, | ||||||
|     # and some of those definitions aren't valid JSON either, so we have |     # and some of those definitions aren't valid JSON either, so we have | ||||||
|     # to loop through here until we find something that is the right type |     # to loop through here until we find something that is the right type | ||||||
| @ -273,6 +275,10 @@ class LinkDetailsExtractor | |||||||
|     @document ||= detect_encoding_and_parse_document |     @document ||= detect_encoding_and_parse_document | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|  |   def head | ||||||
|  |     @head ||= document.at_xpath('/html/head') | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|   def detect_encoding_and_parse_document |   def detect_encoding_and_parse_document | ||||||
|     html = nil |     html = nil | ||||||
|     encoding = nil |     encoding = nil | ||||||
|  | |||||||
| @ -8,5 +8,9 @@ class NokogiriHandler | |||||||
|     def link_rel_include(token_list, token) |     def link_rel_include(token_list, token) | ||||||
|       token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase) |       token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase) | ||||||
|     end |     end | ||||||
|  | 
 | ||||||
|  |     def casecmp(str1, str2) | ||||||
|  |       str1.to_s.casecmp?(str2.to_s) | ||||||
|  |     end | ||||||
|   end |   end | ||||||
| end | end | ||||||
|  | |||||||
| @ -49,7 +49,8 @@ RSpec.describe LinkDetailsExtractor do | |||||||
|       <html lang="en"> |       <html lang="en"> | ||||||
|       <head> |       <head> | ||||||
|         <title>Man bites dog</title> |         <title>Man bites dog</title> | ||||||
|         <meta name="description" content="A dog's tale"> |         <meta name="descripTION" content="A dog's tale"> | ||||||
|  |         <link rel="pretty IcoN" href="/favicon.ico"> | ||||||
|       </head> |       </head> | ||||||
|       </html> |       </html> | ||||||
|     HTML |     HTML | ||||||
| @ -59,7 +60,8 @@ RSpec.describe LinkDetailsExtractor do | |||||||
|         .to have_attributes( |         .to have_attributes( | ||||||
|           title: eq('Man bites dog'), |           title: eq('Man bites dog'), | ||||||
|           description: eq("A dog's tale"), |           description: eq("A dog's tale"), | ||||||
|           language: eq('en') |           language: eq('en'), | ||||||
|  |           icon: eq('https://example.com/favicon.ico') | ||||||
|         ) |         ) | ||||||
|     end |     end | ||||||
|   end |   end | ||||||
| @ -256,7 +258,7 @@ RSpec.describe LinkDetailsExtractor do | |||||||
|       <head> |       <head> | ||||||
|         <meta property="og:url" content="https://example.com/dog.html"> |         <meta property="og:url" content="https://example.com/dog.html"> | ||||||
|         <meta property="og:title" content="Man bites dog"> |         <meta property="og:title" content="Man bites dog"> | ||||||
|         <meta property="og:description" content="A dog's tale"> |         <meta property="OG:description" content="A dog's tale"> | ||||||
|         <meta property="article:published_time" content="2022-01-31T19:53:00+00:00"> |         <meta property="article:published_time" content="2022-01-31T19:53:00+00:00"> | ||||||
|         <meta property="og:author" content="Charlie Brown"> |         <meta property="og:author" content="Charlie Brown"> | ||||||
|         <meta property="og:locale" content="en"> |         <meta property="og:locale" content="en"> | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user