I am trying to integrate a custom NER model into my Rasa chatbot, but I am having a hard time understanding how the SpacyTokenizer and SpacyFeaturizer components in Rasa's SpacyNLP pipeline relate to my custom NER model (en_CustomNer). I am asking because my trained model only has “tok2vec” and “ner” components in its pipeline, and I am not sure how this affects the initialization of SpacyTokenizer and SpacyFeaturizer in my chatbot pipeline. Here is my config.yml (I am using Rasa 3.6.18 and spaCy 3.7.4):
language: en
pipeline:
- name: SpacyNLP
  model: en_CustomNer  ## my custom NER model
- name: SpacyTokenizer
- name: SpacyFeaturizer
  pooling: mean
- name: LexicalSyntacticFeaturizer
- name: CountVectorsFeaturizer
- name: CountVectorsFeaturizer
  analyzer: char_wb
  min_ngram: 2
  max_ngram: 4
- name: DIETClassifier
  epochs: 150
  constrain_similarities: true
- name: SpacyEntityExtractor
- name: FallbackClassifier
  threshold: 0.1
  ambiguity_threshold: 0.1
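For context, my understanding of the pooling: mean setting is that SpacyFeaturizer mean-pools the per-token word vectors from the spaCy Doc into a single sentence vector. A minimal sketch of that idea in plain numpy (this is just how I picture it, not Rasa's actual implementation):

```python
import numpy as np

# Two hypothetical tokens with 3-dimensional word vectors.
token_vectors = np.array([
    [1.0, 2.0, 3.0],
    [3.0, 4.0, 5.0],
])

# "pooling: mean" -> average the token vectors into one sentence vector.
sentence_vector = token_vectors.mean(axis=0)
print(sentence_vector)  # [2. 3. 4.]
```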
I trained my NER model with spaCy by creating a base-config.cfg file following the basic tutorial here. I then followed this Rasa blog tutorial (rasa-spacy-integration) and managed to make the model work inside my Rasa pipeline. However, the Rasa blog post takes a different approach: it starts from a pre-trained pipeline and only replaces the “ner” component with the custom spaCy model. This results in a larger model that has components like “tagger”, “parser”, and “lemmatizer” on top of the two components (“tok2vec” and “ner”) my model has. I am wondering if there is a reason for this (maybe the Rasa pipeline makes use of the other pre-trained components?) and whether there is a problem with my custom NER model only having these two components. Here is the full config.cfg I used for training my model:
[paths]
train = "./ner/train.spacy"
dev = "./ner/test.spacy"
vectors = "en_core_web_lg"
init_tok2vec = null
[system]
gpu_allocator = null
seed = 0
[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
include_static_vectors = true
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.WandbLogger.v3"
project_name = "new_custom_ner"
remove_config_values = []
log_dataset_dir = "./output"
model_log_interval = 1000
entity = null
run_name = null
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
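To double-check which components my exported model actually has, I rebuilt the same pipeline skeleton with a blank English pipeline (a sketch; my real model is the trained en_CustomNer package, which I load the same way with spacy.load):

```python
import spacy

# Same component layout as my trained model: tok2vec + ner only.
nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
nlp.add_pipe("ner")
print(nlp.pipe_names)  # ['tok2vec', 'ner']

# Tokenization does not depend on tagger/parser/lemmatizer at all:
doc = nlp.make_doc("Book a flight from Berlin to London")
print([t.text for t in doc])
```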
Should I replace my model, or is it OK the way it is right now? I am worried that my custom model affects the way SpacyTokenizer and SpacyFeaturizer work.
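One thing I did check: since my training config sets vectors = "en_core_web_lg" and include_static_vectors = true, I expect the exported model to carry the static word vectors that SpacyFeaturizer reads. For comparison, a blank pipeline ships with an empty vector table (a sketch of the check, not my actual model):

```python
import spacy

nlp = spacy.blank("en")
# A blank pipeline has no static vectors; my trained model, initialized
# from en_core_web_lg, should report a non-empty table here instead.
print(nlp.vocab.vectors.shape)
print(len(nlp.vocab.vectors))  # 0 -> no static vectors
```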