Allow spider attr #15
base: master
New file - example spider:

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from urlparse import urljoin
import json

import scrapy
from scrapy.contrib.linkextractors import LinkExtractor


class DmozSpider(scrapy.Spider):
    name = "js_spider"
    start_urls = ['http://www.isjavascriptenabled.com/']
    splash = {'args': {'har': 1, 'html': 1}}

    def parse(self, response):
        is_js = response.xpath("//h1/text()").extract()
        if "".join(is_js).lower() == "yes":
            self.log("JS enabled!")
        else:
            self.log("Error! JS disabled!", scrapy.log.ERROR)
        le = LinkExtractor()

        for link in le.extract_links(response):
            url = urljoin(response.url, link.url)
            yield scrapy.Request(url, self.parse_link)
            break

    def parse_link(self, response):
        title = response.xpath("//title").extract()
        yes = response.xpath("//h1").extract()
        self.log("response is: {}".format(repr(response)))
        self.log(u"Html in response contains {} {}".format("".join(title), "".join(yes)))
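To try the example locally, the spider needs the Splash middleware enabled and a Splash instance running. A minimal settings sketch (the middleware import path, priority number, and Splash URL below are assumptions, not shown anywhere in this diff):

# settings.py -- hypothetical values; adjust the middleware path, priority
# and SPLASH_URL to match your project and your running Splash instance
DOWNLOADER_MIDDLEWARES = {
    'scrapyjs.SplashMiddleware': 725,
}
SPLASH_URL = 'http://127.0.0.1:8050'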
Middleware changes:

@@ -6,6 +6,7 @@
from scrapy.exceptions import NotConfigured

from scrapy import log
from scrapy.http.response.html import HtmlResponse

Comment: looks like unused import
Reply: good point, this is fixed now

from scrapy.http.headers import Headers
@@ -32,6 +33,14 @@ def __init__(self, crawler, splash_base_url, slot_policy):
        self.splash_base_url = splash_base_url
        self.slot_policy = slot_policy

    def get_splash_options(self, request, spider):
        if request.meta.get("dont_proxy"):
            return

Comment: after sleeping on it I think it's a bad idea to reuse the 'dont_proxy' key here, because it's unexpected for developers who are using the splash and crawlera middlewares together in a project - they would expect it to work only with the crawlera middleware. They may enable crawlera for the whole spider (for all requests), and for some specific requests they might want to disable crawlera using 'dont_proxy' while adding 'splash' arguments. I think an explicit 'dont_splash' would be better here.

Reply: I'm not sure. But it seems like this is just a question of terminology: "dont_proxy" to my ears sounds like a general signal - 'don't use any kind of proxy mechanism for this request' - so I thought we could reuse it here. On the other hand, "splash" here is not used as a proxy, so it may be confusing, also ...

        spider_options = getattr(spider, "splash", {})
        request_options = request.meta.get("splash")
        return request_options or spider_options
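The precedence here is: a per-request 'splash' dict in request.meta wins over the spider-level splash attribute, and 'dont_proxy' (or 'dont_splash', if the suggestion above is adopted) opts a single request out entirely. A rough usage sketch (hypothetical spider, not part of this diff):

import scrapy


class OptionsExampleSpider(scrapy.Spider):
    name = "splash_options_example"
    start_urls = ['http://example.com/']
    # spider-level default: every request is rendered by Splash with these args
    splash = {'args': {'html': 1}}

    def parse(self, response):
        # per-request override: request.meta['splash'] wins over the spider attribute
        yield scrapy.Request('http://example.com/js-page', self.parse_page,
                             meta={'splash': {'args': {'har': 1, 'html': 1}}})
        # opt a single request out of Splash (the key the PR currently checks)
        yield scrapy.Request('http://example.com/plain-page', self.parse_page,
                             meta={'dont_proxy': True})

    def parse_page(self, response):
        self.log("got {} bytes from {}".format(len(response.body), response.url))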
    @classmethod
    def from_crawler(cls, crawler):
        splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url)

@@ -43,24 +52,26 @@ def from_crawler(cls, crawler):
        return cls(crawler, splash_base_url, slot_policy)

    def process_request(self, request, spider):
-        splash_options = request.meta.get('splash')
+        splash_options = self.get_splash_options(request, spider)
        if not splash_options:
            return

Comment: @pawelmhm why did you change this? This key replacement looked good enough.

Reply: This is related to https://github.com/scrapinghub/scrapyjs/pull/15/files#r28138976 (avoiding generating multiple splash requests). If splash options are in request meta as content of ...

Comment: you added the check ... but after your explanation I think the way it's implemented in the PR is better, because someone might expect to have a 'splash' key in response.meta, and with the current implementation it's missing. @kmike are you okay with that?

Comment: It turns out I applied similar changes here: 95f0079.

Comment: But maybe your approach is better.
        elif request.meta.get("_splash_processed"):
            return

Comment: is this for retries?

Reply: this is because if you return a new request from a middleware it will go through the whole middleware chain again, so we need to stop an infinite loop from occurring.

Comment: got it. note - I think it would be a bit safer to use ...

Reply: sounds good
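For context, the loop described in that reply comes from Scrapy's downloader middleware contract: when process_request returns a Request, that request is fed back through the whole middleware chain, so the middleware has to mark requests it has already rewritten. A generic sketch of the guard pattern (hypothetical middleware, only to illustrate the mechanism; not this PR's code):

class RewriteOnceMiddleware(object):
    def process_request(self, request, spider):
        if request.meta.get('_rewritten'):
            # second pass: the request returned below is going through the
            # chain again, so let it continue to the downloader untouched
            return
        meta = dict(request.meta, _rewritten=True)
        # returning a Request here re-enters the middleware chain from the start
        return request.replace(meta=meta)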
        if request.method != 'GET':
            log.msg("Currently only GET requests are supported by SplashMiddleware; %s "
                    "will be handled without Splash" % request, logging.WARNING)
            return request

        meta = request.meta
        del meta['splash']
        meta['_splash_processed'] = splash_options

        slot_policy = splash_options.get('slot_policy', self.slot_policy)
        self._set_download_slot(request, meta, slot_policy)

        args = splash_options.setdefault('args', {})
-        args.setdefault('url', request.url)
+        args['url'] = request.url

Comment: @kmike are you okay with this change? What was the use case of ...

Reply: Maybe I was thinking of allowing the user not to pass the 'url' argument at all. This argument is not required for Splash scripts - e.g. you can render a chunk of HTML using splash:set_content. But the meta syntax doesn't make much sense in this case; something along the lines of #12 could be a better fit.

        body = json.dumps(args, ensure_ascii=False)

        if 'timeout' in args:

@@ -86,6 +97,7 @@ def process_request(self, request, spider):
        endpoint = splash_options.setdefault('endpoint', self.default_endpoint)
        splash_base_url = splash_options.get('splash_url', self.splash_base_url)
        splash_url = urljoin(splash_base_url, endpoint)
        meta['_splash_processed'] = True

        req_rep = request.replace(
            url=splash_url,

@@ -96,20 +108,31 @@ def process_request(self, request, spider):
            # are not respected.
            headers=Headers({'Content-Type': 'application/json'}),
        )

        self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
        return req_rep
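To make the rewrite concrete: for the js_spider example above, the request that leaves the middleware is a JSON POST against the configured Splash endpoint, roughly like the following. This is only a sketch - the default endpoint name, the default Splash URL, and the POST method/body are assumptions inferred from the Content-Type header and the json.dumps body above, not shown verbatim in this diff:

POST http://127.0.0.1:8050/render.json
Content-Type: application/json

{"har": 1, "html": 1, "url": "http://www.isjavascriptenabled.com/"}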
    def process_response(self, request, response, spider):
-        splash_options = request.meta.get("_splash_processed")
+        splash_options = self.get_splash_options(request, spider)
        if splash_options:
            endpoint = splash_options['endpoint']
            self.crawler.stats.inc_value(
                'splash/%s/response_count/%s' % (endpoint, response.status)
            )

            response = self.html_response(response, request)

Comment: I think we shouldn't do it. #12 is a better way; basic splash requests should be as barebone as possible.

Reply: I also think that by default the response should be returned without changes, but if we have an option to enable the HTML response and use scrapyjs transparently - i.e. getting rendered HTML without handling this in the callback - that would be very, very convenient, e.g. you can use ... I don't know if you have already spent some time working on #12, and the description doesn't provide much information. How would it work? You say it would be like FormRequest, but that has nothing to do with responses. Can you share a bit of your thoughts? One thing that came to mind is SplashRequest and SplashHtmlRequest classes, where SplashRequest would return the plain response from Splash and SplashHtmlRequest would return rendered HTML along with the headers and cookies returned by the target site. But I'm not sure if my insight is correct. If you insist, I think this part can be removed; we can override scrapyjs and add this inside the project. @pawelmhm ?

Reply: sure, we can make it optional and just make it conditional on the existence of some key either in meta or in spider attributes. What do you think @kmike? IMO rendering HTML will be a very common thing people do, so without handling it in the base middleware most users will have to write code to generate an HTML response themselves.

        return response

    def html_response(self, response, request):
        """Give the user the nice HTML response he probably expects."""
        data = json.loads(response.body)
        html = data.get("html")

Comment: maybe it might also be a good idea to have ...
Reply: yes, having an option to disable/enable this behavior sounds like a good idea
Comment: I think not.
Reply: yes, sounds good

        if not html:
            return response

        return HtmlResponse(data["url"], body=html, encoding='utf8',
                            status=response.status, request=request)
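For comparison, the spider-side boilerplate that the last reply mentions - what a callback has to do if the middleware hands back the raw Splash JSON - would look roughly like this (a sketch; it assumes a render.json-style response body containing 'html' and 'url' keys):

import json

import scrapy
from scrapy.http import HtmlResponse


class ManualUnwrapSpider(scrapy.Spider):
    name = "manual_unwrap_example"
    start_urls = ['http://example.com/']
    splash = {'args': {'html': 1}}

    def parse(self, response):
        # without the middleware's html_response() helper the callback has to
        # unwrap the Splash JSON itself before selectors can be used on it
        data = json.loads(response.body)
        rendered = HtmlResponse(url=data["url"], body=data["html"],
                                encoding='utf8', request=response.request)
        title = rendered.xpath("//title/text()").extract()
        self.log(u"rendered title: {}".format("".join(title)))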
    def _set_download_slot(self, request, meta, slot_policy):
        if slot_policy == SlotPolicy.PER_DOMAIN:
            # Use the same download slot to (sort of) respect download
Comment: -1 to adding tests which fetch remote URLs

Reply: well, it's not really a test, just an extra spider alongside the existing dmoz spider; I used it for development, and it can be removed, no problem.
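If a real test were wanted without fetching remote URLs, the usual approach is to feed the callback a canned response instead. A rough sketch (hypothetical test, not part of this PR; it assumes the example spider above is importable as DmozSpider):

from scrapy.http import HtmlResponse, Request


def test_parse_detects_js():
    url = 'http://www.isjavascriptenabled.com/'
    # build a fake response so parse() can run without any network access
    fake = HtmlResponse(url=url, body='<html><body><h1>YES</h1></body></html>',
                        encoding='utf8', request=Request(url))
    spider = DmozSpider()
    # parse() is a generator; exhaust it so the logging and link extraction run
    list(spider.parse(fake))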