Understanding the LLM
As a 7B parameter model, Mistral-7B-Instruct-v0.3 offers remarkable performance while remaining deployable on standard hardware configurations. Serving it requires approximately 26-28 GB of memory: about 13 GB for the 7B parameters in bf16, plus roughly the same again for the KV cache and runtime overhead. A trn1.2xlarge
instance with 32 GB of memory is therefore suitable for running the Mistral-7B model.
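As a quick sanity check on those numbers, here is a back-of-the-envelope estimate. It is a sketch, not part of the lab: it assumes bf16 weights (2 bytes per parameter) and treats the "roughly the same again" overhead as a rule of thumb rather than a measured figure.

```python
# Rough memory estimate for serving a ~7B-parameter model in bf16.
# The 2x overhead factor is a rule of thumb, not a measured value.
params = 7e9                    # ~7 billion parameters
bytes_per_param = 2             # bf16 stores each weight in 2 bytes
weights_gib = params * bytes_per_param / 2**30
total_gib = weights_gib * 2     # assume comparable KV cache + runtime overhead
print(f"weights ~ {weights_gib:.1f} GiB, total ~ {total_gib:.1f} GiB")
# weights ~ 13.0 GiB, total ~ 26.1 GiB -- within the 32 GB of a trn1.2xlarge
```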
In this lab, serving the Mistral-7B-Instruct-v0.3 model is implemented with FastAPI, Ray Serve, and PyTorch-based Hugging Face Transformers to expose an API for text generation.
Here is the code we'll use to compile and serve the model:
import os
import json
import logging

from fastapi import FastAPI
from ray import serve
import torch
import torch_neuronx
from transformers import AutoTokenizer
from transformers_neuronx.mistral.model import MistralForSampling
from huggingface_hub import snapshot_download

# Initialize FastAPI
app = FastAPI()

neuron_cores = int(os.getenv('NEURON_CORES', 2))  # Default to 2 for trn1.2xlarge
cacheDir = os.path.join('/tmp', 'model', 'neuron-mistral7bv0.3')

# --- Logging Setup ---
logger = logging.getLogger("ray.serve")
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)


@serve.deployment(num_replicas=1)
@serve.ingress(app)
class APIIngress:
    def __init__(self, mistral_model_handle):
        self.handle = mistral_model_handle

    @app.get("/infer")
    async def infer(self, sentence: str):
        # Forward the request to the MistralModel deployment and await the result
        result = await self.handle.infer.remote(sentence)
        return result


@serve.deployment(
    name="mistral-7b",
    autoscaling_config={"min_replicas": 1, "max_replicas": 1},
    ray_actor_options={
        "resources": {"neuron_cores": neuron_cores}
    }
)
class MistralModel:
    def __init__(self):
        try:
            logger.info("Initializing model with pre-compiled files...")
            mistral_model = os.getenv('MODEL_ID', 'askulkarni2/neuron-mistral7bv0.3')
            logger.info(f"Using model ID: {mistral_model}")

            model_path = '/tmp/model/neuron-mistral7bv0.3'
            model_cache = '/tmp/model/cache'

            # Initialize model state
            self.neuron_model = None
            self.tokenizer = None

            # Download model files to the local directory if they are not already present
            os.makedirs(model_cache, exist_ok=True)
            if not os.path.exists(model_path):
                os.makedirs(cacheDir, exist_ok=True)
                logger.info("Downloading model files to /tmp/model/neuron-mistral7bv0.3")
                model_path = snapshot_download(repo_id=mistral_model, local_dir=cacheDir, local_dir_use_symlinks=False)

            logger.info(f"Model path: {model_path}")
            logger.info(f"Checking model path contents: {os.listdir(model_path)}")

            # Configure the Neuron runtime: use both NeuronCores and prefetched NEFF artifacts
            os.environ.update({
                "NEURON_RT_VISIBLE_CORES": "0,1",
                "NEURON_RT_NUM_CORES": "2",
                "NEURON_RT_USE_PREFETCHED_NEFF": "1",
            })

            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                local_files_only=True
            )

            # Set padding token
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                logger.info("Set padding token to EOS token")

            logger.info("Loading model...")
            # Load the model with minimal configuration: batch size 1, tensor parallelism
            # across the 2 NeuronCores, and bf16 precision
            self.neuron_model = MistralForSampling.from_pretrained(
                model_path, batch_size=1, tp_degree=2, amp='bf16'
            )

            logger.info("Model preparation...")
            if not os.listdir(model_cache):
                # Compile the model the first time and save the compiled artifacts in the cache dir
                self.neuron_model.to_neuron()
                self.neuron_model.save(model_cache)
            else:
                # Load pre-compiled NEFF artifacts from the cache
                self.neuron_model.load(model_cache)
                self.neuron_model.to_neuron()
            logger.info("Model successfully prepared for inference")

            # Verify initialization
            if not self._verify_model_state():
                raise RuntimeError("Model initialization failed verification")
            logger.info("Model initialization complete")

        except Exception as e:
            logger.error(f"Error during model initialization: {e}")
            raise

    def _verify_model_state(self):
        if self.neuron_model is None:
            return False
        if not hasattr(self.neuron_model, 'sample'):
            return False
        if self.tokenizer is None:
            return False
        return True

    def infer(self, sentence: str):
        input_ids = self.tokenizer.encode(sentence, return_tensors="pt")
        with torch.inference_mode():
            try:
                logger.info(f"Performing inference on input: {sentence}")
                generated_sequences = self.neuron_model.sample(
                    input_ids, sequence_length=2048, top_k=50
                )
                decoded_sequences = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in generated_sequences]
                logger.info(f"Inference result: {decoded_sequences}")
                return decoded_sequences
            except Exception as e:
                logger.error(f"Error during inference: {e}")
                return {"error": "Inference failed"}


# Create an entry point for the FastAPI application
entrypoint = APIIngress.bind(MistralModel.bind())
This Python code performs the following tasks:
- Configures an APIIngress class responsible for handling inference requests
- Defines a MistralModel class responsible for managing the Mistral language model
- Loads the model and either compiles it on first use or reuses previously compiled artifacts from the cache directory
- Creates an entry point for the FastAPI application
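For a quick local smoke test, the bound application can be started with Ray Serve's `serve.run` API. The sketch below assumes the code above is saved as `mistral_serve.py`; the file name is an assumption, and in the lab the application is deployed through a Ray Serve configuration rather than run by hand. The `serve run mistral_serve:entrypoint` CLI is an equivalent way to keep it running in the foreground.

```python
# Local deployment sketch (assumes the code above is saved as mistral_serve.py).
import time

from ray import serve

from mistral_serve import entrypoint  # the bound APIIngress -> MistralModel application

# Deploys both classes; the Serve HTTP proxy listens on port 8000 by default.
serve.run(entrypoint)

# Keep the driver process alive so the deployments stay up for manual testing.
while True:
    time.sleep(60)
```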
Through these steps, the endpoint accepts an input sentence and returns generated text. This efficiency makes the model suitable for a wide variety of natural language processing applications, such as chatbots and text generation tasks.
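Once the application is running, the `/infer` route can be queried with a plain HTTP GET. The host, port, and prompt below are illustrative assumptions based on Ray Serve's default local proxy, not values taken from the lab:

```python
# Example client call (host/port assume Ray Serve's default proxy on 127.0.0.1:8000).
import requests

response = requests.get(
    "http://127.0.0.1:8000/infer",
    params={"sentence": "What is generative AI?"},
    timeout=600,  # sampling up to 2048 tokens can take a while
)
print(response.json())
```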