I am scraping large websites, some with hundreds of pages and lots of content. For smaller websites, my scraping function works without a problem, but for larger ones I keep getting the error:
[2024-07-29 23:42:24 +0000] [7] [ERROR] Worker (pid:8) was sent SIGKILL! Perhaps out of memory?
This is my scraping code:
import requests
from bs4 import BeautifulSoup
from queue import Queue
from urllib.parse import urljoin, urlparse


def scrape_website(start_url, visited=None, base_url=None):
    if visited is None:
        visited = set()
    if base_url is None:
        base_url = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(start_url))

    queue = Queue()
    queue.put(start_url)
    text_content = []

    while not queue.empty():
        url = queue.get()
        if url in visited:
            continue
        visited.add(url)
        print(f"Scraping: {url}")
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove elements that never contain useful text
            for script_or_style in soup(["script", "style", "head"]):
                script_or_style.extract()

            tags_of_interest = [
                'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio', 'b', 'base', 'bdi', 'bdo', 'blockquote',
                'body', 'br', 'button', 'canvas', 'caption', 'cite', 'code', 'col', 'colgroup', 'data', 'datalist',
                'dd', 'del', 'details', 'dfn', 'dialog', 'div', 'dl', 'dt', 'em', 'embed', 'fieldset', 'figcaption',
                'figure', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'html', 'i', 'iframe',
                'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'link', 'main', 'map', 'mark', 'meta', 'meter',
                'nav', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p', 'param', 'picture', 'pre',
                'progress', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'section', 'select', 'slot', 'small',
                'source', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'template', 'textarea',
                'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'u', 'ul', 'var'
            ]

            # Collect the text of every tag of interest, plus a URL pointing at
            # the closest element with an id (find_closest_id is a small helper
            # defined elsewhere in my code)
            for tag in soup.find_all(tags_of_interest):
                text = tag.get_text(separator=' ', strip=True)
                element_url = url + find_closest_id(tag)
                if text:
                    text_content.append((text, element_url))

            # Queue up internal links for later iterations
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.startswith(('mailto:', 'tel:', '#')):
                    continue
                next_page = urljoin(base_url, urlparse(href).path)
                if next_page not in visited:
                    queue.put(next_page)
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")

    return text_content
I am hoping to get some advice on ways to improve my scraping approach and reduce the amount of memory used during the scraping process. Just an FYI, I am running this on a DigitalOcean droplet with 2 vCPUs and 2 GB of memory.
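For example, would something along these lines even help? This is just a rough sketch (the trimmed tag list and the results.jsonl filename are placeholders), where each page's text is appended to a file on disk instead of being held in the text_content list, and the parse tree is freed once the page is done:

import json

def write_page_text(url, soup, out_file):
    # Append each (url, text) pair to a JSONL file instead of keeping
    # every tuple in an in-memory list for the whole crawl
    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'li']):  # trimmed tag list, only for this sketch
        text = tag.get_text(separator=' ', strip=True)
        if text:
            out_file.write(json.dumps({"url": url, "text": text}) + "\n")
    soup.decompose()  # free the parse tree once this page is processed

# inside the crawl loop, instead of text_content.append(...):
# with open('results.jsonl', 'a') as out_file:
#     write_page_text(url, soup, out_file)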
Thanks in advance!