
I am trying to convert a Flan-T5 model downloaded from Hugging Face into ONNX format and run inference with it.

My input is a description of disease symptoms, and the expected output is the disease name.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import onnx

# Use the GPU if available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl").to(device)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")

  

# Export the model to ONNX format
onnx_path = "flan-t5-xl.onnx"
dummy_input = tokenizer("What's the disease name in this text: Example text", return_tensors="pt", padding=True).to(device)
dummy_input_ids = dummy_input["input_ids"]
dummy_attention_mask = dummy_input["attention_mask"]
dummy_decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=11,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "decoder_input_ids": {0: "batch_size"},
            "output": {0: "batch_size", 1: "sequence_length"},
        },
    )
print(f"Model saved to {onnx_path}")

# Inference using the ONNX model on GPU

import onnxruntime

onnx_model = onnxruntime.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])

Creating the InferenceSession fails with the following error:

InvalidGraph: [ONNXRuntimeError] : 10 : INVALID_GRAPH : Load model from flan-t5-xl.onnx failed:This is an invalid model. Type Error: Type 'tensor(int64)' of input parameter (/decoder/block.0/layer.0/SelfAttention/Sub_output_0) of operator (Min) in node (/decoder/block.0/layer.0/SelfAttention/Min) is invalid.

The rest of my inference code, which I never get to run because of the error above:

input_text = input("Enter Disease/Symptom Detail: ")
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)

onnx_inputs = {
    "input_ids": input_ids.cpu().numpy(),
    "attention_mask": attention_mask.cpu().numpy(),
    "decoder_input_ids": decoder_input_ids.cpu().numpy(),
}

onnx_output = onnx_model.run(None, onnx_inputs)[0]
decoded_output = tokenizer.decode(onnx_output[0], skip_special_tokens=True)

print('-' * 100)
print(f"Name of Disease based on Entered Text: {decoded_output}")

1 Answer
Use the already-converted ONNX checkpoints from https://huggingface.co/datasets/bakks/flan-t5-onnx instead.

To convert the google/flan-t5 models yourself, see https://huggingface.co/datasets/bakks/flan-t5-onnx/blob/main/exportt5.py:

from pathlib import Path
import transformers as t
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# print out the version of the transformers library
print("transformers version:", t.__version__)



models = [
    #"google/flan-t5-small",
    #"google/flan-t5-base",
    #"google/flan-t5-large",
    "google/flan-t5-xl",
    "google/flan-t5-xxl",
]

for model_id in models:
    model_name = model_id.split("/")[1]
    onnx_path = Path("onnx/" + model_name)

    # load the vanilla transformers model and convert it to ONNX
    # (newer optimum versions use export=True instead of from_transformers=True)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_id, from_transformers=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # save onnx checkpoint and tokenizer
    model.save_pretrained(onnx_path)
    tokenizer.save_pretrained(onnx_path)

Then try again. The export writes separate encoder and decoder graphs (encoder_model.onnx, decoder_model.onnx, ...) into the output directory, so point the session at one of those files rather than at the directory itself:

import onnxruntime

onnx_model = onnxruntime.InferenceSession(
    str(onnx_path / "encoder_model.onnx"), providers=["CUDAExecutionProvider"]
)
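
For end-to-end generation it is usually simpler to let optimum drive the ONNX sessions for you rather than calling onnxruntime by hand. A minimal sketch, assuming the export loop above has written onnx/flan-t5-xl and that onnxruntime-gpu is installed (the prompt wording is only an example, and device handling can vary slightly between optimum versions):

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

onnx_dir = "onnx/flan-t5-xl"  # directory produced by the export loop above
tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
model = ORTModelForSeq2SeqLM.from_pretrained(onnx_dir, provider="CUDAExecutionProvider")

input_text = input("Enter Disease/Symptom Detail: ")
inputs = tokenizer(
    f"What's the disease name in this text: {input_text}", return_tensors="pt"
).to("cuda")

# generate() runs the encoder once and then decodes autoregressively over the ONNX graphs
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])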
