Enable specifying explicit list of external posts to display

alshedivat · alshedivat · commit 81082a5bee84 · 2024-01-10T00:20:08.000-05:00
diff --git a/_plugins/external-posts.rb b/_plugins/external-posts.rb
@@ -1,6 +1,8 @@
 require 'feedjira'
 require 'httparty'
 require 'jekyll'
+require 'nokogiri'
+require 'time'
 
 module ExternalPosts
   class ExternalPostsGenerator < Jekyll::Generator
@@ -10,27 +12,84 @@ class ExternalPostsGenerator < Jekyll::Generator
     def generate(site)
       if site.config['external_sources'] != nil
         site.config['external_sources'].each do |src|
-          p "Fetching external posts from #{src['name']}:"
-          xml = HTTParty.get(src['rss_url']).body
-          feed = Feedjira.parse(xml)
-          feed.entries.each do |e|
-            p "...fetching #{e.url}"
-            slug = e.title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
-            path = site.in_source_dir("_posts/#{slug}.md")
-            doc = Jekyll::Document.new(
-              path, { :site => site, :collection => site.collections['posts'] }
-            )
-            doc.data['external_source'] = src['name'];
-            doc.data['feed_content'] = e.content;
-            doc.data['title'] = "#{e.title}";
-            doc.data['description'] = e.summary;
-            doc.data['date'] = e.published;
-            doc.data['redirect'] = e.url;
-            site.collections['posts'].docs << doc
+          puts "Fetching external posts from #{src['name']}:"
+          if src['rss_url'] # feed-based source; takes precedence when both keys are set
+            fetch_from_rss(site, src)
+          elsif src['posts'] # explicit list of post URLs; a source with neither key is skipped
+            fetch_from_urls(site, src)
           end
         end
       end
     end
-  end
 
+    def fetch_from_rss(site, src) # Registers every entry of the feed at src['rss_url'] as an external post.
+      xml = HTTParty.get(src['rss_url']).body # raw feed body; no status check or error handling here
+      feed = Feedjira.parse(xml)
+      process_entries(site, src, feed.entries)
+    end
+
+    def process_entries(site, src, entries) # Creates one post document per feed entry, tagged with src['name'].
+      entries.each do |e|
+        puts "...fetching #{e.url}"
+        create_document(site, src['name'], e.url, { # hash keys mirror what create_document reads
+          title: e.title,
+          content: e.content,
+          summary: e.summary,
+          published: e.published
+        })
+      end
+    end
+
+    def create_document(site, source_name, url, content)
+      slug = content[:title].downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
+      path = site.in_source_dir("_posts/#{slug}.md")
+      doc = Jekyll::Document.new(
+        path, { :site => site, :collection => site.collections['posts'] }
+      )
+      doc.data['external_source'] = source_name
+      doc.data['title'] = content[:title]
+      doc.data['feed_content'] = content[:content]
+      doc.data['description'] = content[:summary]
+      doc.data['date'] = content[:published]
+      doc.data['redirect'] = url
+      site.collections['posts'].docs << doc
+    end
+
+    def fetch_from_urls(site, src) # Registers each explicitly listed entry of src['posts'] as an external post.
+      src['posts'].each do |post|
+        puts "...fetching #{post['url']}"
+        content = fetch_content_from_url(post['url']) # hash with :title, :content, :summary scraped from the page
+        content[:published] = parse_published_date(post['published_date']) # date comes from config, not from the page
+        create_document(site, src['name'], post['url'], content)
+      end
+    end
+
+    def parse_published_date(published_date)
+      case published_date
+      when String
+        Time.parse(published_date).utc
+      when Date
+        published_date.to_time.utc
+      else
+        raise "Invalid date format for #{published_date}"
+      end
+    end
+
+    def fetch_content_from_url(url) # Downloads url and returns a hash with :title, :content and :summary taken from its HTML.
+      html = HTTParty.get(url).body # raw response body; no status check or error handling here
+      parsed_html = Nokogiri::HTML(html)
+
+      title = parsed_html.at('head title')&.text || '' # each field falls back to '' when its node is missing
+      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
+      body_content = parsed_html.at('body')&.inner_html || ''
+
+      {
+        title: title,
+        content: body_content,
+        summary: description
+        # :published is not set here; fetch_from_urls adds it from the configured published_date.
+      }
+    end
+
+  end
 end