Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 35 additions & 37 deletions QEfficient/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,50 @@
import os
import warnings

from QEfficient.utils import custom_format_warning

# For faster downloads via hf_transfer
# This code is put above import statements as this needs to be executed before
# hf_transfer is imported (will happen on line 15 via leading imports)
# hf_transfer is imported (will happen on line 14 via leading imports)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Placeholder for all non-transformer models registered in QEfficient
import QEfficient.utils.model_registery # noqa: F401
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform
from QEfficient.utils import custom_format_warning
from QEfficient.utils.logging_utils import logger

# Use a custom warning format for a better logging experience
warnings.formatwarning = custom_format_warning

# Package version
__version__ = "0.0.1.dev0"

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]


def check_qaic_sdk():
"""Check if QAIC SDK is installed"""
Expand All @@ -36,38 +67,5 @@ def check_qaic_sdk():
return False


# Conditionally import QAIC-related modules if the SDK is installed
__version__ = "0.0.1.dev0"

if check_qaic_sdk():
from QEfficient.base import (
QEFFAutoModel,
QEFFAutoModelForCausalLM,
QEFFAutoModelForImageTextToText,
QEFFAutoModelForSpeechSeq2Seq,
QEFFCommonLoader,
)
from QEfficient.compile.compile_helper import compile
from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.peft import QEffAutoPeftModelForCausalLM
from QEfficient.transformers.transform import transform

# Users can use QEfficient.export for exporting models to ONNX
export = qualcomm_efficient_converter

__all__ = [
"transform",
"export",
"compile",
"cloud_ai_100_exec_kv",
"QEFFAutoModel",
"QEFFAutoModelForCausalLM",
"QEffAutoPeftModelForCausalLM",
"QEFFAutoModelForImageTextToText",
"QEFFAutoModelForSpeechSeq2Seq",
"QEFFCommonLoader",
]

else:
if not check_qaic_sdk():
logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
139 changes: 78 additions & 61 deletions QEfficient/generation/cloud_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,40 @@
#
# -----------------------------------------------------------------------------

import importlib
import platform
import sys
from pathlib import Path
from typing import Dict, List, Optional, Union
from warnings import warn

import numpy as np

try:
import qaicrt
except ImportError:
import platform
import sys

sys.path.append(f"/opt/qti-aic/dev/lib/{platform.machine()}")
import qaicrt

try:
import QAicApi_pb2 as aicapi
except ImportError:
import sys

sys.path.append("/opt/qti-aic/dev/python")
import QAicApi_pb2 as aicapi
class QAICInferenceSession:
_qaicrt = None
_aicapi = None

aic_to_np_dtype_mapping = {
aicapi.FLOAT_TYPE: np.dtype(np.float32),
aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
aicapi.INT8_Q_TYPE: np.dtype(np.int8),
aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
aicapi.INT16_Q_TYPE: np.dtype(np.int16),
aicapi.INT32_Q_TYPE: np.dtype(np.int32),
aicapi.INT32_I_TYPE: np.dtype(np.int32),
aicapi.INT64_I_TYPE: np.dtype(np.int64),
aicapi.INT8_TYPE: np.dtype(np.int8),
}
@property
def qaicrt(self):
    """Return the ``qaicrt`` module, importing it on first access.

    The imported module is cached on ``QAICInferenceSession._qaicrt`` so the
    import is only attempted until it succeeds once. If the plain import
    fails, ``/opt/qti-aic/dev/lib/<platform.machine()>`` is added to
    ``sys.path`` and the import is retried.

    Raises:
        ImportError: if ``qaicrt`` still cannot be imported after the
            fallback directory has been added to ``sys.path``.
    """
    if QAICInferenceSession._qaicrt is None:
        try:
            QAICInferenceSession._qaicrt = importlib.import_module("qaicrt")
        except ImportError:
            fallback_dir = f"/opt/qti-aic/dev/lib/{platform.machine()}"
            # When the retry below fails too, the cache stays None and this
            # branch runs again on the next access; only append the directory
            # once so sys.path does not accumulate duplicate entries.
            if fallback_dir not in sys.path:
                sys.path.append(fallback_dir)
            QAICInferenceSession._qaicrt = importlib.import_module("qaicrt")
    return QAICInferenceSession._qaicrt

@property
def aicapi(self):
    """Return the ``QAicApi_pb2`` module, importing it on first access.

    The imported module is cached on ``QAICInferenceSession._aicapi`` so the
    import is only attempted until it succeeds once. If the plain import
    fails, ``/opt/qti-aic/dev/python`` is added to ``sys.path`` and the
    import is retried.

    Raises:
        ImportError: if ``QAicApi_pb2`` still cannot be imported after the
            fallback directory has been added to ``sys.path``.
    """
    if QAICInferenceSession._aicapi is None:
        try:
            QAICInferenceSession._aicapi = importlib.import_module("QAicApi_pb2")
        except ImportError:
            fallback_dir = "/opt/qti-aic/dev/python"
            # When the retry below fails too, the cache stays None and this
            # branch runs again on the next access; only append the directory
            # once so sys.path does not accumulate duplicate entries.
            if fallback_dir not in sys.path:
                sys.path.append(fallback_dir)
            QAICInferenceSession._aicapi = importlib.import_module("QAicApi_pb2")
    return QAICInferenceSession._aicapi

class QAICInferenceSession:
def __init__(
self,
qpc_path: Union[Path, str],
Expand All @@ -58,59 +55,81 @@ def __init__(
:activate: bool. If false, activation will be disabled. Default=True.
:enable_debug_logs: bool. If True, debug logs will be enabled. Default=False.
"""

# Build the dtype map one time, not on every property access
self.aic_to_np_dtype_mapping = {
self.aicapi.FLOAT_TYPE: np.dtype(np.float32),
self.aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
self.aicapi.INT8_Q_TYPE: np.dtype(np.int8),
self.aicapi.UINT8_Q_TYPE: np.dtype(np.uint8),
self.aicapi.INT16_Q_TYPE: np.dtype(np.int16),
self.aicapi.INT32_Q_TYPE: np.dtype(np.int32),
self.aicapi.INT32_I_TYPE: np.dtype(np.int32),
self.aicapi.INT64_I_TYPE: np.dtype(np.int64),
self.aicapi.INT8_TYPE: np.dtype(np.int8),
}

# Load QPC
if device_ids is not None:
devices = qaicrt.QIDList(device_ids)
self.context = qaicrt.Context(devices)
self.queue = qaicrt.Queue(self.context, device_ids[0])
devices = self.qaicrt.QIDList(device_ids)
self.context = self.qaicrt.Context(devices)
self.queue = self.qaicrt.Queue(self.context, device_ids[0])
else:
self.context = qaicrt.Context()
self.queue = qaicrt.Queue(self.context, 0) # Async API
self.context = self.qaicrt.Context()
self.queue = self.qaicrt.Queue(self.context, 0) # Async API

if enable_debug_logs:
if self.context.setLogLevel(qaicrt.QLogLevel.QL_DEBUG) != qaicrt.QStatus.QS_SUCCESS:
if self.context.setLogLevel(self.qaicrt.QLogLevel.QL_DEBUG) != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to setLogLevel")
qpc = qaicrt.Qpc(str(qpc_path))

qpc = self.qaicrt.Qpc(str(qpc_path))

# Load IO Descriptor
iodesc = aicapi.IoDesc()
iodesc = self.aicapi.IoDesc()
status, iodesc_data = qpc.getIoDescriptor()
if status != qaicrt.QStatus.QS_SUCCESS:
if status != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to getIoDescriptor")
iodesc.ParseFromString(bytes(iodesc_data))

self.allowed_shapes = [
[(aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
[(self.aic_to_np_dtype_mapping[x.type].itemsize, list(x.dims)) for x in allowed_shape.shapes]
for allowed_shape in iodesc.allowed_shapes
]
self.bindings = iodesc.selected_set.bindings
self.binding_index_map = {binding.name: binding.index for binding in self.bindings}

# Create and load Program
prog_properties = qaicrt.QAicProgramProperties()
prog_properties = self.qaicrt.QAicProgramProperties()
prog_properties.SubmitRetryTimeoutMs = 60_000
if device_ids and len(device_ids) > 1:
prog_properties.devMapping = ":".join(map(str, device_ids))
self.program = qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != qaicrt.QStatus.QS_SUCCESS:

self.program = self.qaicrt.Program(self.context, None, qpc, prog_properties)
if self.program.load() != self.qaicrt.QStatus.QS_SUCCESS:
raise RuntimeError("Failed to load program")

if activate:
self.activate()

# Create input qbuffers and buf_dims
self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = qaicrt.BufferDimensionsVecRef(
[(aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
self.qbuffers = [self.qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings]
self.buf_dims = self.qaicrt.BufferDimensionsVecRef(
[(self.aic_to_np_dtype_mapping[binding.type].itemsize, list(binding.dims)) for binding in self.bindings]
)

@property
def input_names(self) -> List[str]:
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_INPUT]
return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_INPUT]

@property
def output_names(self) -> List[str]:
return [binding.name for binding in self.bindings if binding.dir == aicapi.BUFFER_IO_TYPE_OUTPUT]
return [binding.name for binding in self.bindings if binding.dir == self.aicapi.BUFFER_IO_TYPE_OUTPUT]

def activate(self):
"""Activate qpc"""

self.program.activate()
self.execObj = qaicrt.ExecObj(self.context, self.program)
self.execObj = self.qaicrt.ExecObj(self.context, self.program)

def deactivate(self):
"""Deactivate qpc"""
Expand All @@ -131,7 +150,7 @@ def set_buffers(self, buffers: Dict[str, np.ndarray]):
warn(f'Buffer: "{buffer_name}" not found')
continue
buffer_index = self.binding_index_map[buffer_name]
self.qbuffers[buffer_index] = qaicrt.QBuffer(buffer.tobytes())
self.qbuffers[buffer_index] = self.qaicrt.QBuffer(buffer.tobytes())
self.buf_dims[buffer_index] = (
buffer.itemsize,
buffer.shape if len(buffer.shape) > 0 else (1,),
Expand All @@ -157,21 +176,19 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
Return:
:Dict[str, np.ndarray]:
"""
# Set inputs

self.set_buffers(inputs)
if self.execObj.setData(self.qbuffers, self.buf_dims) != qaicrt.QStatus.QS_SUCCESS:
if self.execObj.setData(self.qbuffers, self.buf_dims) != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to setData")
# # Run with sync API
# if self.execObj.run(self.qbuffers) != qaicrt.QStatus.QS_SUCCESS:
# Run with async API
if self.queue.enqueue(self.execObj) != qaicrt.QStatus.QS_SUCCESS:

if self.queue.enqueue(self.execObj) != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to enqueue")
if self.execObj.waitForCompletion() != qaicrt.QStatus.QS_SUCCESS:

if self.execObj.waitForCompletion() != self.qaicrt.QStatus.QS_SUCCESS:
error_message = "Failed to run"
# Print additional error messages for unmatched dimension error

if self.allowed_shapes:
error_message += "\n\n"
error_message += '(Only if "No matching dimension found" error is present above)'
error_message += "\n\n(Only if 'No matching dimension found' error is present above)"
error_message += "\nAllowed shapes:"
for i, allowed_shape in enumerate(self.allowed_shapes):
error_message += f"\n{i}\n"
Expand All @@ -189,18 +206,18 @@ def run(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
continue
error_message += f"{binding.name}:\t{elemsize}\t{shape}\n"
raise ValueError(error_message)
# Get output buffers

status, output_qbuffers = self.execObj.getData()
if status != qaicrt.QStatus.QS_SUCCESS:
if status != self.qaicrt.QStatus.QS_SUCCESS:
raise MemoryError("Failed to getData")
# Build output

outputs = {}
for output_name in self.output_names:
buffer_index = self.binding_index_map[output_name]
if self.qbuffers[buffer_index].size == 0:
continue
outputs[output_name] = np.frombuffer(
bytes(output_qbuffers[buffer_index]),
aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
self.aic_to_np_dtype_mapping[self.bindings[buffer_index].type],
).reshape(self.buf_dims[buffer_index][1])
return outputs
Loading