It’s my first time using a vector database (VDB), and I wrote some basic scraping and embedding code as a test. I can’t figure out what’s wrong with the API call — I keep getting the same error. Here are my code and the output. Additionally, if you could recommend a vector database for newcomers, that would be great. Thanks in advance.
import os
import requests
from bs4 import BeautifulSoup
import re
from fake_useragent import UserAgent
import openai
import pinecone
# --- Pinecone / OpenAI setup ----------------------------------------------
# The pinecone-client v3+ SDK is object-based: every call must go through a
# `Pinecone` client instance.  The original code constructed a bare `Pinecone`
# (never imported -> NameError) and then fell back to the removed
# module-level helpers `pinecone.create_index` / `pinecone.Index`, which send
# requests WITHOUT the API key attached -- that is what produces the
# "Wrong API key" / 403 Forbidden response seen in the traceback.
pc = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
index_name = "web"
# Create the index if it does not exist yet.  `dimension` must match the
# embedding model's output size (text-embedding-ada-002 -> 1536 floats).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        # v3+ requires a deployment spec; adjust cloud/region to your project.
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1"),
    )
# The client resolves the index host automatically -- no hard-coded URL.
index = pc.Index(index_name)
# Securely fetch your OpenAI API key from the environment; never hard-code
# (or commit) real keys in source files.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# Home pages of VC firms to scrape and embed.
website_urls = [
    "https://www.accel.com", "https://www.a16z.com", "https://www.greylock.com",
    "https://www.benchmark.com", "https://www.sequoiacap.com", "https://www.indexventures.com",
    "https://www.kpcb.com", "https://www.lsvp.com", "https://www.matrixpartners.com",
    "https://www.500.co", "https://www.sparkcapital.com", "https://www.insightpartners.com"
]
def scrape_home_page_and_save_text(url):
    """Fetch *url* and return its visible text, whitespace-collapsed.

    Uses a randomized User-Agent to reduce trivial bot blocking.
    Returns the page text as a single-spaced string, or None on any HTTP
    error or network failure (best-effort scraping: callers skip None).
    """
    try:
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        # Timeout so one dead host cannot hang the whole run.
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # BUG FIX: the original pattern r's+' deleted runs of the
            # letter "s"; r'\s+' collapses runs of whitespace as intended.
            text_content = re.sub(r'\s+', ' ', soup.get_text()).strip()
            print(f"Scraped {url} successfully.")
            return text_content
        else:
            print(f"Failed to scrape {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        # Broad catch is deliberate: scraping is best-effort per URL.
        print(f"An error occurred while scraping {url}: {str(e)}")
        return None
def embed_text_content(text_content):
    """Embed *text_content* with OpenAI's text-embedding-ada-002 (1536-dim).

    Returns the embedding vector (list of floats) or None on API failure.

    NOTE(review): ada-002 has a ~8191-token input limit; very long pages
    should be truncated or chunked before embedding -- confirm page sizes.
    """
    try:
        response = openai.Embedding.create(input=text_content, model="text-embedding-ada-002")
        return response['data'][0]['embedding']  # Accessing embedding data correctly
    # BUG FIX: `openai.Error` does not exist, so the original except clause
    # itself raised AttributeError whenever the API call failed, hiding the
    # real error.  The 0.x SDK's base class is openai.error.OpenAIError.
    except openai.error.OpenAIError as e:
        print(f"An error occurred while generating embedding: {str(e)}")
        return None
# --- Scrape, embed, and upsert --------------------------------------------
# Collect an (id, vector) pair for every URL that scrapes and embeds cleanly;
# the source URL doubles as the Pinecone vector id.
embeddings = []
ids = []
for url in website_urls:
    print(f"Scraping {url}...")
    page_text = scrape_home_page_and_save_text(url)
    if not page_text:
        continue  # scrape failed or page was empty -- skip this site
    print(f"Embedding content from {url}...")
    vector = embed_text_content(page_text)
    if vector:
        embeddings.append(list(vector))
        ids.append(url)
# Upsert everything in a single batch call.
if embeddings and ids:
    index.upsert(vectors=list(zip(ids, embeddings)))
    print(f"Data for all sites uploaded to Pinecone.")
print("Scraping, embedding, and uploading finished.")
Reason: Forbidden HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 01 May 2024 10:37:34 GMT', 'Content-Type': 'text/plain', 'Content-Length': '9', 'Connection': 'keep-alive', 'x-pinecone-auth-rejected-reason': 'Wrong API key', 'www-authenticate': 'Wrong API key', 'server': 'envoy'}) HTTP response body: Forbidden
I double-checked the API key and also tried supplying it via environment variables.