I have a working Scrapy project that downloads TSV files and saves them to S3.
I use a custom pipeline to save the original file names with dates.
I am wondering if it is possible to convert the TSV files to Parquet before uploading them to S3. If so, how would I do this in Scrapy?
I should note that I am able to convert the files locally (see "Converting TSV to Parquet locally" below), but I would like to do the conversion inline, before the files are uploaded to S3; the sketch at the end shows the direction I have been considering.
This is what I currently have working...
## Items

import scrapy

class DownfilesItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    original_file_name = scrapy.Field()
    date = scrapy.Field()
## Pipeline to save original file names with dates

from datetime import datetime

from scrapy.pipelines.files import FilesPipeline

class OriginalNameFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # Append today's date to the original file name,
        # e.g. "data.tsv" becomes "data_20240101.tsv".
        file_name_tsv = request.url.split("=")[-1]
        file_name: str = (
            file_name_tsv.removesuffix(".tsv")
            + "_" + datetime.today().strftime("%Y%m%d")
            + "." + file_name_tsv.split(".")[-1]
        )
        return file_name
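
For completeness, the pipeline and the S3 target are wired up in settings.py roughly like this (the module path and bucket name here are placeholders, not my real values):

## Relevant settings (module path and bucket are placeholders)

ITEM_PIPELINES = {
    "myproject.pipelines.OriginalNameFilesPipeline": 1,
}
FILES_STORE = "s3://my-bucket/raw-files/"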
## In my scraper

def parse_all_items(self, response):
    all_urls = [...]  # a bunch of URLs
    for url in all_urls:
        item = DownfilesItem()
        item['file_urls'] = [url]
        item['original_file_name'] = url.split("=")[-1]
        yield item
## Converting TSV to Parquet locally

import os

from pyarrow import csv
import pyarrow.parquet as pq

parse_options = csv.ParseOptions(delimiter="\t")  # tab-delimited, not the literal character "t"
for name in os.listdir(src_dir):
    localpath = os.path.join(src_dir, name)
    print(localpath)
    if localpath.endswith(".tsv"):
        table = csv.read_csv(localpath, parse_options=parse_options)
        pq.write_table(table, localpath.replace(".tsv", ".parquet"))
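
For reference, the direction I have been considering (untested) is to subclass my pipeline and override FilesPipeline.file_downloaded, which, as far as I can tell from the Scrapy source, is where the downloaded bytes are persisted, and convert the TSV body to Parquet in memory there before it is stored. A rough sketch of what I mean, assuming file_downloaded and store.persist_file behave the way I described:

## What I am imagining for the inline conversion (untested sketch)

from io import BytesIO

from scrapy.utils.misc import md5sum
from pyarrow import csv
import pyarrow.parquet as pq

class TsvToParquetFilesPipeline(OriginalNameFilesPipeline):
    def file_path(self, request, response=None, info=None):
        # Reuse the dated-name logic above, but store the file as .parquet.
        path = super().file_path(request, response=response, info=info)
        return path.rsplit(".", 1)[0] + ".parquet"

    # The keyword-only item arg keeps this compatible with newer Scrapy
    # versions that pass item=... to file_downloaded.
    def file_downloaded(self, response, request, info, *, item=None):
        # Parse the downloaded TSV bytes into an Arrow table in memory.
        parse_options = csv.ParseOptions(delimiter="\t")
        table = csv.read_csv(BytesIO(response.body), parse_options=parse_options)
        # Write the table to an in-memory Parquet buffer.
        buf = BytesIO()
        pq.write_table(table, buf)
        buf.seek(0)
        # Persist the converted bytes the same way FilesPipeline normally would.
        path = self.file_path(request, response=response, info=info)
        checksum = md5sum(buf)
        buf.seek(0)
        self.store.persist_file(path, buf, info)
        return checksum

If something like this is viable, I would point ITEM_PIPELINES at TsvToParquetFilesPipeline instead of OriginalNameFilesPipeline. Is this the right hook, or is there a better place in Scrapy to do the conversion?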