I have PDF files in Azure Blob Storage and I want to build a knowledge mining solution on top of them. I created the data source, the index, the skillset, and the indexer. However, when I run everything I get the warning: "Could not execute skill because one or more skill input was invalid."
Skillset code:
<code>from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    DocumentExtractionSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerSkillset,
)

def create_skillset(search_service_endpoint, search_service_api_key, skillset_name):
    credential = AzureKeyCredential(search_service_api_key)
    indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
    # Define skills
    doc_extraction_skill = DocumentExtractionSkill(
        name="documentExtractionSkill",
        description="Extract text from documents",
        context="/document",
        configuration={"imageAction": "generateNormalizedImagePerPage"},
        inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
        outputs=[OutputFieldMappingEntry(name="content", target_name="/documents/content")]
    )
    # Create skillset
    skillset = SearchIndexerSkillset(
        name=skillset_name,
        skills=[doc_extraction_skill]
    )
    # Create skillset in Azure Cognitive Search
    indexer_client.create_skillset(skillset)
    print(f"Skillset '{skillset_name}' created successfully.")
</code>
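For reference, a minimal sketch of how this function is called; the endpoint, key, and skillset name below are placeholders:
<code>create_skillset(
    search_service_endpoint="https://<service-name>.search.windows.net",
    search_service_api_key="<admin-api-key>",
    skillset_name="pdf-skillset"  # placeholder name
)
</code>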
Indexer code:
<code>from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    FieldMapping,
    IndexingParameters,
    IndexingSchedule,
    SearchIndexer,
)

# Function to create an indexer
def create_indexer(search_service_endpoint, search_service_api_key, indexer_name, data_source_name, index_name, skillset_name):
    credential = AzureKeyCredential(search_service_api_key)
    indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
    field_mappings = [
        FieldMapping(source_field_name="metadata_storage_path", target_field_name="metadata_storage_path"),
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="metadata_storage_name"),
        FieldMapping(source_field_name="metadata_storage_last_modified", target_field_name="metadata_storage_last_modified"),
        FieldMapping(source_field_name="metadata_content_type", target_field_name="metadata_content_type"),
    ]
    output_field_mappings = [
        FieldMapping(source_field_name="/document/content", target_field_name="content"),
    ]
    # Define indexing parameters
    indexing_parameters = IndexingParameters(
        configuration={
            "indexStorageMetadataOnlyForOversizedDocuments": True,
            "failOnUnsupportedContentType": False,
            "indexedFileNameExtensions": ".pdf,.docx,.txt,.json",
            "parseJson": True,
            "parsingMode": "default",
            "allowSkillsetToReadFileData": False
        }
    )
    indexer = SearchIndexer(
        name=indexer_name,
        data_source_name=data_source_name,
        target_index_name=index_name,
        skillset_name=skillset_name,
        field_mappings=field_mappings,
        output_field_mappings=output_field_mappings,
        schedule=IndexingSchedule(interval="PT15M"),
        parameters=indexing_parameters
    )
    indexer_client.create_indexer(indexer)
    print(f"Indexer '{indexer_name}' created.")
</code>
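For completeness, a minimal sketch of how the indexer can be triggered on demand and its warnings read back, using run_indexer and get_indexer_status from the same azure-search-documents SDK; print_indexer_warnings is just an illustrative helper, not part of the setup above:
<code>from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient

def print_indexer_warnings(search_service_endpoint, search_service_api_key, indexer_name):
    # Illustrative helper (not part of the original setup)
    credential = AzureKeyCredential(search_service_api_key)
    indexer_client = SearchIndexerClient(endpoint=search_service_endpoint, credential=credential)
    # Trigger an on-demand run (the indexer also runs on its PT15M schedule)
    indexer_client.run_indexer(indexer_name)
    # Warnings such as "Could not execute skill because one or more
    # skill input was invalid." appear on the latest execution result
    status = indexer_client.get_indexer_status(indexer_name)
    if status.last_result is not None:
        for warning in status.last_result.warnings or []:
            print(warning.message)
</code>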