1
1
require 'feedjira'
2
2
require 'httparty'
3
3
require 'jekyll'
4
+ require 'nokogiri'
5
+ require 'time'
4
6
5
7
module ExternalPosts
6
8
class ExternalPostsGenerator < Jekyll ::Generator
@@ -10,27 +12,84 @@ class ExternalPostsGenerator < Jekyll::Generator
10
12
# Generator entry point: imports posts from every entry of
# `site.config['external_sources']`.
#
# A source with an `rss_url` is read as a feed; otherwise a source with a
# `posts` list is fetched URL by URL. A source that defines neither key is
# announced on stdout but otherwise skipped.
#
# @param site [Object] Jekyll site object (presumably a Jekyll::Site); only
#   `config` is read here, the object is passed through to the fetch helpers
# @return [Object, nil] nil when no `external_sources` key is configured;
#   the return value is not meaningful to callers
def generate(site)
  sources = site.config['external_sources']
  return if sources.nil?

  sources.each do |src|
    puts "Fetching external posts from #{src['name']}:"
    # `rss_url` wins when a source defines both keys.
    if src['rss_url']
      fetch_from_rss(site, src)
    elsif src['posts']
      fetch_from_urls(site, src)
    end
  end
end
34
- end
35
24
25
# Downloads the feed at `src['rss_url']`, parses it with Feedjira and hands
# the entries to #process_entries.
#
# A missing response body or a body Feedjira cannot parse is reported on
# stdout and the source is skipped, so one broken feed does not abort the
# import of the remaining sources.
#
# @param site [Object] site object, passed through to #process_entries
# @param src [Hash] source configuration; `rss_url` is read here
# @return [Object, nil] nil when the feed was skipped
def fetch_from_rss(site, src)
  xml = HTTParty.get(src['rss_url']).body
  return if xml.nil?

  begin
    feed = Feedjira.parse(xml)
  rescue StandardError => e
    puts "Error parsing RSS feed from #{src['rss_url']} - #{e.message}"
    return
  end

  process_entries(site, src, feed.entries)
end
30
+
31
# Creates one document per feed entry.
#
# Each entry is announced on stdout and then passed to #create_document
# together with its title, content, summary and publication time.
#
# @param site [Object] site object, passed through to #create_document
# @param src [Hash] source configuration; `name` is read here
# @param entries [Enumerable] entries responding to `url`, `title`,
#   `content`, `summary` and `published`
# @return [Enumerable] whatever `entries.each` returns
def process_entries(site, src, entries)
  entries.each do |entry|
    puts "...fetching #{entry.url}"
    create_document(
      site,
      src['name'],
      entry.url,
      {
        title: entry.title,
        content: entry.content,
        summary: entry.summary,
        published: entry.published
      }
    )
  end
end
42
+
43
# Builds a Jekyll::Document for one external post and appends it to the
# site's `posts` collection.
#
# The document path is `_posts/<slug>.md` inside the site source directory.
# The slug normally comes from the title; when the title is nil or contains
# no ASCII word characters (so the title slug would be empty and every such
# post would share the path `_posts/.md`), the slug falls back to the source
# name plus the last segment of the URL.
#
# @param site [Object] site object providing `in_source_dir` and `collections`
# @param source_name [String] stored as `external_source`
# @param url [String] stored as `redirect`
# @param content [Hash] with keys :title, :content, :summary and :published
# @return [Array] the `posts` collection's document list, after appending
def create_document(site, source_name, url, content)
  slug = external_post_slug(content[:title], source_name, url)
  path = site.in_source_dir("_posts/#{slug}.md")
  posts = site.collections['posts']

  doc = Jekyll::Document.new(path, { site: site, collection: posts })
  doc.data['external_source'] = source_name
  doc.data['title'] = content[:title]
  doc.data['feed_content'] = content[:content]
  doc.data['description'] = content[:summary]
  doc.data['date'] = content[:published]
  doc.data['redirect'] = url

  posts.docs << doc
end

# Picks the file slug for an external post: the slugified title, or
# "<source>-<last url segment>" when the title yields an empty slug.
def external_post_slug(title, source_name, url)
  slug = slugify_external_text(title)
  return slug unless slug.empty?

  [source_name, url.to_s.split('/').last].map { |part| slugify_external_text(part) }.join('-')
end

# Lowercases and trims the text, turns spaces into dashes, then drops every
# character that is not an ASCII word character or a dash.
def slugify_external_text(text)
  text.to_s.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
end
57
+
58
# Creates one document per entry of `src['posts']`.
#
# For each post the page at `post['url']` is fetched for its title, body and
# description, `post['published_date']` is converted via
# #parse_published_date, and the result is passed to #create_document.
#
# @param site [Object] site object, passed through to #create_document
# @param src [Hash] source configuration; `name` and `posts` are read here
# @return [Object] whatever `src['posts'].each` returns
def fetch_from_urls(site, src)
  src['posts'].each do |post|
    post_url = post['url']
    puts "...fetching #{post_url}"

    content = fetch_content_from_url(post_url)
    # The fetched page does not supply a date; it comes from the source entry.
    content[:published] = parse_published_date(post['published_date'])

    create_document(site, src['name'], post_url, content)
  end
end
66
+
67
# Normalises a configured publication date to a UTC Time.
#
# Strings are parsed with Time.parse, Time values are converted with
# `getutc` (which returns a new object, leaving the argument untouched), and
# Date values (including DateTime, a Date subclass) go through `to_time`.
#
# @param published_date [String, Time, Date] the configured value
# @return [Time] the same instant expressed in UTC
# @raise [RuntimeError] when the value is of any other type (e.g. nil)
# @raise [ArgumentError] when a String cannot be parsed by Time.parse
def parse_published_date(published_date)
  case published_date
  when String
    Time.parse(published_date).utc
  when Time
    published_date.getutc
  when Date
    published_date.to_time.utc
  else
    raise "Invalid date format for #{published_date}"
  end
end
77
+
78
# Downloads a page and extracts the pieces used to build an external post.
#
# The title is the text of `head title` with surrounding whitespace removed,
# the summary is the `content` attribute of `head meta[name="description"]`,
# and the content is the inner HTML of `body`. Any piece that is missing
# from the page becomes an empty string.
#
# @param url [String] address of the page to fetch
# @return [Hash] with keys :title, :content and :summary
def fetch_content_from_url(url)
  html = HTTParty.get(url).body
  parsed_html = Nokogiri::HTML(html)

  title = parsed_html.at('head title')&.text&.strip || ''
  description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
  body_content = parsed_html.at('body')&.inner_html || ''

  # :published is intentionally absent; fetch_from_urls adds it from the
  # source configuration.
  {
    title: title,
    content: body_content,
    summary: description
  }
end
93
+
94
+ end
36
95
end
0 commit comments