@@ -74,34 +74,30 @@ async def acrawl(self):
 
         # google search
         for keyword in keywords:
+            search_urls = []
             try:
                 await page.goto("https://www.google.com", wait_until="load")
                 await page.fill('textarea[name="q"]', keyword)
                 await page.press('textarea[name="q"]', "Enter")
-                for _ in range(30):
-                    # Check for a popup window and close it
-                    if len(self.browser.pages) > 1:
-                        await self.browser.pages[1].close()
-                    # Scroll to the bottom of the page
-                    await page.mouse.wheel(0, 1000)
-                    await asyncio.sleep(0.25)
-                    elements = await page.query_selector_all(
-                        "//div[starts-with(@class, 'g ')]//span/a[@href]"
-                    )
-                    if len(elements) > 100:
-                        break
-                result_urls = [
-                    await link.get_attribute("href") for link in elements
-                ]
+                # pagination
+                for _ in range(10):
+                    await page.wait_for_load_state("load")
+                    # parse urls
+                    elements = await page.locator(
+                        "xpath=//div[starts-with(@class, 'g ')]//span/a[@href]"
+                    ).all()
+                    page_urls = [await e.get_attribute("href") for e in elements]
+                    search_urls.extend(page_urls)
+                    # click the "Next" button
+                    await page.locator("xpath=//td[@role='heading']").last.click()
                 logger.info(
-                    f"google_search() {keyword=} GOT {len(result_urls)} results"
+                    f"google_search() {keyword=} GOT {len(search_urls)} results"
                 )
             except Exception as ex:
-                result_urls = []
                 logger.warning(f"google_search() {type(ex).__name__}: {ex}")
 
         # browse urls in parallel
-        tasks = [asyncio.create_task(self.abrowse(url)) for url in result_urls]
+        tasks = [asyncio.create_task(self.abrowse(url)) for url in search_urls]
         await asyncio.gather(*tasks)
 
     def crawl(self):
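
For context, here is a minimal standalone sketch of the pagination approach this hunk introduces, assuming Playwright's async API and the same XPath selectors as the diff. The function name `google_search_paginated`, the page cap, and the explicit stop when no "Next" control matches are illustrative additions, not part of the commit (the committed code instead lets the click timeout raise into the surrounding `try/except`).

```python
# Illustrative sketch, not the project's code: same selectors as the diff,
# but standalone, and with an explicit stop when the "Next" control is
# missing instead of relying on an enclosing try/except to swallow the
# click timeout.
import asyncio

from playwright.async_api import async_playwright


async def google_search_paginated(keyword: str, max_pages: int = 10) -> list[str]:
    """Collect result URLs for one keyword across up to max_pages result pages."""
    search_urls: list[str] = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://www.google.com", wait_until="load")
        await page.fill('textarea[name="q"]', keyword)
        await page.press('textarea[name="q"]', "Enter")
        for _ in range(max_pages):
            await page.wait_for_load_state("load")
            # parse result links on the current page
            elements = await page.locator(
                "xpath=//div[starts-with(@class, 'g ')]//span/a[@href]"
            ).all()
            for e in elements:
                href = await e.get_attribute("href")
                if href:
                    search_urls.append(href)
            # advance via the "Next" control, stopping on the last page
            next_links = page.locator("xpath=//td[@role='heading']")
            if await next_links.count() == 0:
                break
            await next_links.last.click()
        await browser.close()
    return search_urls


if __name__ == "__main__":
    urls = asyncio.run(google_search_paginated("playwright python"))
    print(f"got {len(urls)} urls")
```

Compared with the removed scroll loop (`mouse.wheel` plus a quarter-second sleep against continuous-scroll results), clicking the "Next" link and re-parsing each page bounds the work to a known number of result pages rather than polling until an element-count threshold is hit.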
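The final lines of the hunk fan out one task per collected URL at once via `asyncio.gather`. If unbounded fan-out ever becomes a problem, a hedged variant is to cap concurrency with a semaphore; in the sketch below, `abrowse` stands in for the class's own coroutine and the limit of 10 is an arbitrary tuning choice, neither taken from the commit.

```python
# Hypothetical bounded-concurrency variant of the final gather; `abrowse`
# is a placeholder for the class's own coroutine and `limit` is arbitrary.
import asyncio


async def browse_all(abrowse, urls: list[str], limit: int = 10) -> None:
    sem = asyncio.Semaphore(limit)

    async def bounded(url: str) -> None:
        async with sem:  # at most `limit` abrowse() calls run at once
            await abrowse(url)

    await asyncio.gather(*(bounded(u) for u in urls))
```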