@@ -6,8 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/examples/models/llava/runner/llava_runner.h>
+#include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <gflags/gflags.h>
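+// Llama2cTokenizer reads the binary tokenizer format (tokenizer.bin) that this
+// example's LLaVA artifacts ship with.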
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 #define STB_IMAGE_IMPLEMENTATION
 #include <stb_image.h>
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -44,7 +48,12 @@ DEFINE_int32(
     -1,
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
-using executorch::extension::llm::Image;
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
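+// A MultimodalInput holds either a text segment or an Image; make_text_input()
+// and make_image_input() construct the corresponding variant.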
 
 void load_image(const std::string& image_path, Image& image) {
   int width, height, channels;
@@ -127,14 +136,58 @@ int32_t main(int32_t argc, char** argv) {
         ->_unsafe_reset_threadpool(num_performant_cores);
   }
 #endif
-  // create llama runner
-  example::LlavaRunner runner(model_path, tokenizer_path, temperature);
+  // Load tokenizer
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  // load() reports failure via its return value; the pointer itself stays valid.
+  if (tokenizer->load(tokenizer_path) != ::tokenizers::Error::Ok) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer));
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
 
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  static const char* kPresetPrompt =
+      "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
   Image image;
   load_image(image_path, image);
-  std::vector<Image> images = {image};
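+  // The runner consumes these in order: system preamble, then the image, then
+  // the user prompt, matching the LLaVA conversation layout.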
+  std::vector<MultimodalInput> inputs = {
+      make_text_input(std::string(kPresetPrompt)),
+      make_image_input(image),
+      make_text_input(std::string(prompt)),
+  };
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.temperature = temperature;
+  // Cap total sequence length (prompt + generated tokens) from the --seq_len flag.
+  config.seq_len = seq_len;
+
+  // Generate
+  ET_LOG(Info, "Starting generation...");
+  auto error = runner->generate(inputs, config);
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
 
-  // generate
-  runner.generate(std::move(images), prompt, seq_len);
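+  // generate() streams decoded tokens to stdout as they are produced; end the
+  // output line cleanly once generation completes.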
+  printf("\n");
   return 0;
 }