So, I am trying to port my script from requests
to httpx
as the one with the requests
is failing on websites protected by Cloudflare and despite my best efforts, I am not able to fix it. Besides, for fun, I wanted to test if the async will improve the speed.
So, what my script does is scan an entire WordPress website for broken links and add them in a CSV file.
I was using ThreadPoolExecutor in this to check all links found in a post in one go rather than looping through it.
Now, I want to do the same thing using Asyncio/Httpx.
So, I tried it like this:
import httpx
import asyncio
import bs4
import sys
import csv
import concurrent.futures
from concurrent.futures import as_completed
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-US,en-GB;q=0.9,en;q=0.8',
'cache-control': 'max-age=0',
'dnt': '1',
'priority': 'u=0, i',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'upgrade-insecure-requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15 Edg/126.0.0.0'
}
async def threadWorkAsync(link, headers, client):
print("t Content Link:", link[0], end=" ")
try:
link_response= await client.head(link[0], headers=headers)
print(link_response.status_code)
except (
httpx.HTTPStatusError,
httpx.HTTPError,
httpx.ConnectError,
httpx.InvalidURL,
httpx.ConnectTimeout,
httpx.RequestError,
) as errh:
print("{errh} in URL, ", link)
def asyncThreadRunner(link, headers, client):
# Create run loop for this thread and block until completion
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(threadWorkAsync(link, headers, client))
def executeBrokenLinkCheck(links, client):
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(asyncThreadRunner, link, headers, client) for link in links]
return [future.result() for future in as_completed(futures)]
def get_pages(domain):
pages = int( httpx.get("https://" + domain + "/wp-json/wp/v2/posts", headers=headers).headers["X-WP-TotalPages"] )
return pages
def getLinks(rendered_content):
soup = bs4.BeautifulSoup(rendered_content, "html.parser")
return [(link["href"], link.text) for link in soup("a") if "href" in link.attrs]
async def fetch_posts(url):
async with httpx.AsyncClient() as client:
response = await client.get(url, headers=headers)
if response.status_code != 200:
print(f"Error: {response.status_code}")
return
data = response.json()
for post in data:
post_link = post['link']
print("Post Link:", post_link, end=" ")
# Check the status of the post link
link_response = await client.head(post_link, headers=headers)
print(link_response.status_code)
# Get links from post content
links = getLinks(post['content']['rendered'])
executeBrokenLinkCheck(links, client)
async def main():
base_url = sys.argv[1]
pages = get_pages(base_url)
print(pages)
# Iterate through 10 pages, assuming 10 posts per page
for page in range(1, pages+1):
url = f"https://{base_url}/wp-json/wp/v2/posts?page={page}"
print(url)
print(f"Fetching posts from page {page}...")
await fetch_posts(url)
asyncio.run(main())
But it giving this error…
File "C:pythonlibasynciobase_events.py", line 515, in _check_closed
raise RuntimeError('Event loop is closed')
RuntimeError: Event loop is closed
After I looked it up then found out that it can be prevented by using this:
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
Now, after adding this line in the asyncThreadRunner
, the script pauses in between and never resumes. Even pressing Ctrl+C doesn’t quit it. I have to manually end it from the TasK Manager.
How do I fix this?