I am looking to convert a Flan-T5 model downloaded from Hugging Face into ONNX format and run inference with it.
My input is a text describing disease symptoms, and the expected output is the disease name.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import onnx
# Set the device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl").to(device)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
# Export the model to ONNX format
onnx_path = "flan-t5-xl.onnx"
dummy_input = tokenizer("What's the disease name in this text: Example text", return_tensors="pt", padding=True).to(device)
dummy_input_ids = dummy_input["input_ids"]
dummy_attention_mask = dummy_input["attention_mask"]
# T5 starts decoding from the pad token; building the tensor directly avoids the
# extra </s> that tokenizer("<pad>") would append
dummy_decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]], device=device)
with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=11,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            # mark sequence lengths dynamic too, so inference is not pinned
            # to the dummy input's shape
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "decoder_input_ids": {0: "batch_size", 1: "decoder_sequence_length"},
            "output": {0: "batch_size", 1: "decoder_sequence_length"},
        },
    )
print(f"Model saved to {onnx_path}")
# Inference using the ONNX model on GPU
import onnxruntime
onnx_model = onnxruntime.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])
Creating the session fails with the following error:

InvalidGraph: [ONNXRuntimeError] : 10 : INVALID_GRAPH : Load model from flan-t5-xl.onnx failed:This is an invalid model. Type Error: Type 'tensor(int64)' of input parameter (/decoder/block.0/layer.0/SelfAttention/Sub_output_0) of operator (Min) in node (/decoder/block.0/layer.0/SelfAttention/Min) is invalid.
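The failing node is the Min inside T5's relative position bias, which operates on int64 tensors. In opset 11 the ONNX Min operator is only defined for floating-point types; integer support was added in opset 12. The most likely fix is therefore to re-export with a newer opset, keeping everything else the same:

with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=13,  # Min supports integer tensors from opset 12 onward
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "attention_mask": {0: "batch_size", 1: "sequence_length"},
            "decoder_input_ids": {0: "batch_size", 1: "decoder_sequence_length"},
            "output": {0: "batch_size", 1: "decoder_sequence_length"},
        },
    )

With the re-exported model loaded into the session, inference proceeds as below.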
input_text = input("Enter Disease/Symptom Detail: ")
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]], device=device)  # T5's decoder start token
onnx_inputs = {
"input_ids": input_ids.cpu().numpy(),
"attention_mask": attention_mask.cpu().numpy(),
"decoder_input_ids": decoder_input_ids.cpu().numpy(),
}
onnx_logits = onnx_model.run(None, onnx_inputs)[0]
# The exported graph returns raw logits, so take the argmax to get token ids.
# Note: a single forward pass only predicts one decoder step; full generation
# needs an autoregressive loop that feeds each predicted token back into
# decoder_input_ids.
predicted_ids = onnx_logits.argmax(axis=-1)
decoded_output = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
print('-' * 100)
print(f"Name of Disease based on Entered Text: {decoded_output}")