I'm trying to solve https://leetcode.com/problems/web-crawler-multithreaded/.
This code works (well, at least on the toy test cases, before it eventually TLEs):
from typing import List
from urllib.parse import urlparse
from threading import Lock, Thread
import queue

class Solution:
    def __init__(self):
        self.visited = set()
        self.frontier = queue.Queue()
        self.visitLock = Lock()

    def threadCrawler(self, htmlParser):
        while True:
            nextUrl = self.frontier.get()
            urls = htmlParser.getUrls(nextUrl)
            # mark the URL we just fetched as visited
            with self.visitLock:
                self.visited.add(nextUrl)
            # keep only links on the same host
            host = urlparse(nextUrl).hostname
            urls = list(filter(lambda x: urlparse(x).hostname == host, urls))
            # drop links we've already visited, enqueue the rest
            with self.visitLock:
                urls = list(filter(lambda x: x not in self.visited, urls))
            for url in urls:
                self.frontier.put(url)
            self.frontier.task_done()

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        self.frontier.put(startUrl)
        n = 10
        for i in range(n):
            Thread(target=self.threadCrawler, args=(htmlParser,), daemon=True).start()
        # blocks until task_done() has been called once per enqueued URL
        self.frontier.join()
        return self.visited
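
My guess at the eventual TLE: the visited check and the put happen under separate lock acquisitions, so two workers can both see a URL as unvisited and enqueue (then crawl) it twice. A sketch of deduplicating at enqueue time instead; this is untested against the real judge, and the claim-before-put pattern is my assumption about a fix, not something I've profiled:

    def threadCrawler(self, htmlParser):
        while True:
            nextUrl = self.frontier.get()
            host = urlparse(nextUrl).hostname
            for url in htmlParser.getUrls(nextUrl):
                if urlparse(url).hostname != host:
                    continue
                with self.visitLock:
                    # claim the URL before enqueueing it, so no other
                    # worker can enqueue the same URL a second time
                    if url in self.visited:
                        continue
                    self.visited.add(url)
                self.frontier.put(url)
            self.frontier.task_done()

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        # under this scheme startUrl has to be claimed up front too
        self.visited.add(startUrl)
        self.frontier.put(startUrl)
        n = 10
        for i in range(n):
            Thread(target=self.threadCrawler, args=(htmlParser,), daemon=True).start()
        self.frontier.join()
        return self.visited

Here visited means "claimed/enqueued" rather than "fetched", but since every enqueued URL eventually gets fetched before join() returns, the returned set should be the same.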
But this version, which uses a ThreadPoolExecutor, doesn't work: it times out on the toy examples even with a single worker.
from typing import List
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import queue

class Solution:
    def __init__(self):
        self.visited = set()
        self.frontier = queue.Queue()
        self.visitLock = Lock()

    def threadCrawler(self, htmlParser):
        # identical worker to the Thread version above
        while True:
            nextUrl = self.frontier.get()
            urls = htmlParser.getUrls(nextUrl)
            with self.visitLock:
                self.visited.add(nextUrl)
            host = urlparse(nextUrl).hostname
            urls = list(filter(lambda x: urlparse(x).hostname == host, urls))
            with self.visitLock:
                urls = list(filter(lambda x: x not in self.visited, urls))
            for url in urls:
                self.frontier.put(url)
            self.frontier.task_done()

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        self.frontier.put(startUrl)
        n = 1
        executor = ThreadPoolExecutor(max_workers=n)
        for i in range(n):
            executor.submit(self.threadCrawler, htmlParser, daemon=True)
        self.frontier.join()
        return self.visited
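
In case it helps with diagnosing this: my current suspicion is that executor.submit() forwards extra keyword arguments to the callable itself, so daemon=True ends up being passed to threadCrawler; any exception raised there is stored on the returned Future rather than printed, so the worker would die silently and frontier.join() would block forever. A standalone repro of that theory (worker here is just an illustrative stand-in, not part of my solution):

    from concurrent.futures import ThreadPoolExecutor

    def worker(parser):
        pass

    ex = ThreadPoolExecutor(max_workers=1)
    # daemon=True is forwarded to worker(), not to the pool's thread
    fut = ex.submit(worker, None, daemon=True)
    # surfaces the exception the pool swallowed:
    # TypeError: worker() got an unexpected keyword argument 'daemon'
    print(fut.exception())

If that's right, just dropping daemon=True probably isn't the whole story either: the submitted workers loop forever, so the pool would still need some way to shut down (e.g. sentinel values on the queue) before the process can exit cleanly.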