I am trying to build an API using Flask that extracts the text from a given URL and generates valid tags for that text. For example, if the text is a recipe for chicken curry, valid tags could be recipe, Indian cuisine, food, etc.
I have tried the NLTK library, a TF-IDF vectorizer, etc., but all of them only rank the words that already appear in the text by frequency; they do not generate new words.
Does anyone have a solution for this problem?
I have also tried using the GPT-2 model, but the output is not what I am expecting:
import requests
from bs4 import BeautifulSoup
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from flask import Flask, request, jsonify
# Flask application object; the route handler below is registered on it.
app = Flask(__name__)
# Load pre-trained GPT-2 model and tokenizer
# Both are created once at module import so that every request reuses the
# same instances instead of reloading them per call.
model_name = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
def fetch_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            calling request handler on an unresponsive host.

    Returns:
        The paragraph texts joined by single spaces (an empty string when
        the page has no ``<p>`` elements), or ``None`` if the download or
        the parsing failed.
    """
    # Network stage: connection errors, timeouts and non-2xx responses all
    # derive from RequestException.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {str(e)}")
        return None

    # Parsing stage: kept separate so a parser failure is not reported as a
    # fetch failure.
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all paragraphs and concatenate their text
        paragraphs = soup.find_all('p')
        return ' '.join(para.get_text() for para in paragraphs)
    except Exception as e:
        print(f"Error parsing content from {url}: {str(e)}")
        return None
def generate_tags(text, max_length=20, num_return_sequences=3):
    """Generate hashtag-style tags for *text* with the module-level GPT-2 model.

    Args:
        text: Source text the tags should describe.
        max_length: Maximum number of tokens to generate after the prompt.
        num_return_sequences: Number of beam-search candidates to decode.

    Returns:
        A list of unique ``#``-prefixed tokens found in the generated
        continuations, in first-seen order (possibly empty), or ``None`` if
        tokenization or generation raised.
    """
    try:
        # Create a prompt to generate tags
        prompt = "Tags for this text: "

        # The model can only attend to ``n_positions`` tokens in total, so
        # the prompt is truncated to leave room for the tokens to generate.
        # Untruncated page text would overflow the context window.
        context_limit = max(model.config.n_positions - max_length, 1)
        input_ids = tokenizer.encode(
            prompt + text,
            return_tensors='pt',
            truncation=True,
            max_length=context_limit,
        )
        prompt_length = input_ids.shape[1]

        # Generate tags using GPT-2 model
        output = model.generate(
            input_ids,
            max_length=max_length + prompt_length,
            num_return_sequences=num_return_sequences,
            # Beam search cannot return more sequences than it has beams.
            num_beams=max(5, num_return_sequences),
            no_repeat_ngram_size=2,
            early_stopping=True,
            # GPT-2 defines no pad token; reuse EOS so padding of finished
            # beams is explicit.
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode generated tags
        tags = []
        for seq in output:
            # Each returned sequence starts with the prompt tokens. Decode
            # only the continuation so '#' tokens that were already in the
            # source text are not reported as generated tags.
            generated = tokenizer.decode(
                seq[prompt_length:], skip_special_tokens=True
            ).strip()
            # Extract tags from the generated text
            tags.extend(
                token.strip()
                for token in generated.split()
                if token.startswith('#')
            )
        # Different beams often repeat the same tag; keep first occurrences.
        return list(dict.fromkeys(tags))
    except Exception as e:
        print(f"Error generating tags: {str(e)}")
        return None
@app.route('/generate_tags', methods=['POST'])
def generate_tags_api():
    """Handle ``POST /generate_tags``: fetch a page and return tags for it.

    Expects a JSON object body with a string ``url`` field.

    Returns:
        ``{"tags": [...]}`` on success; ``{"error": ...}`` with status 400
        when the body is missing, is not a JSON object, or has no usable
        ``url``; ``{"error": ...}`` with status 500 when fetching or tag
        generation fails or yields nothing.
    """
    # silent=True makes a missing or malformed JSON body come back as None
    # instead of raising, so bad client input gets the 400 below rather
    # than an unhandled error.
    data = request.get_json(silent=True)
    url = data.get('url') if isinstance(data, dict) else None
    if not url or not isinstance(url, str):
        return jsonify({'error': 'URL is required'}), 400

    try:
        text = fetch_text_from_url(url)
        if not text:
            return jsonify({'error': 'Failed to fetch content from URL'}), 500

        tags = generate_tags(text)
        if tags:
            return jsonify({'tags': tags})
        return jsonify({'error': 'Failed to generate tags from URL'}), 500
    except Exception as e:
        # Last-resort boundary so the client always receives a JSON body.
        return jsonify({'error': str(e)}), 500
if __name__ == "__main__":
    import os

    # Start Flask's built-in development server when run as a script.
    # Defaults match the previous hard-coded values (port 8000, debug on);
    # set FLASK_DEBUG=0 or PORT=<n> to override without editing the file.
    # NOTE(review): debug mode enables the interactive debugger and the
    # reloader; confirm it is switched off anywhere the server is reachable
    # by untrusted clients.
    app.run(
        port=int(os.environ.get("PORT", "8000")),
        debug=os.environ.get("FLASK_DEBUG", "1") != "0",
    )
Noobmaster69 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.