# tokenizers-cpp

This project provides a cross-platform C++ tokenizer binding library that can be universally deployed.
It wraps and binds the [HuggingFace tokenizers library](https://github.com/huggingface/tokenizers)
and [sentencepiece](https://github.com/google/sentencepiece), and provides a minimal common interface in C++.

The main goal of the project is to enable tokenizer deployment for language model applications
to native platforms with minimal dependencies, and to remove some of the barriers of
cross-language bindings. This project is developed in part with, and
used in, [MLC LLM](https://github.com/mlc-ai/mlc-llm). We have tested the following platforms:

- iOS
- Android
- Windows
- Linux
- Web browser

## Getting Started

The easiest way is to add this project as a submodule and then
include it via `add_subdirectory` in your CMake project.
You will also need to enable C++17 support.

- First, make sure you have Rust installed.
- If you are cross-compiling, make sure you install the necessary Rust target.
  For example, run `rustup target add aarch64-apple-ios` to install the iOS target.
- You can then link the library, as shown in the CMake sketch below.

See the [example](example) folder for an example CMake project.
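
Below is a minimal `CMakeLists.txt` sketch of that flow. The target name `demo`,
the submodule path `3rdparty/tokenizers-cpp`, and the CMake minimum version are
illustrative assumptions; adjust them to your own project layout.

```cmake
# Sketch: consume tokenizers-cpp via a git submodule.
# Assumption: the submodule is checked out at 3rdparty/tokenizers-cpp,
# and `demo` stands in for your own executable target.
cmake_minimum_required(VERSION 3.18)
project(demo_project CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Builds libtokenizers_c.a, libsentencepiece.a, and libtokenizers_cpp.a.
add_subdirectory(3rdparty/tokenizers-cpp)

add_executable(demo main.cc)
# Linking tokenizers_cpp pulls in the other two static libraries.
target_link_libraries(demo PRIVATE tokenizers_cpp)
# If the include path is not propagated automatically, add it explicitly.
target_include_directories(demo PRIVATE 3rdparty/tokenizers-cpp/include)
```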

### Example Code

```c++
#include <tokenizers_cpp.h>

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

using tokenizers::Tokenizer;

// Helper to load a file from disk into an in-memory string blob.
std::string LoadBytesFromFile(const std::string& path) {
  std::ifstream fs(path, std::ios::in | std::ios::binary);
  if (fs.fail()) {
    std::cerr << "Cannot open " << path << std::endl;
    exit(1);
  }
  return std::string(std::istreambuf_iterator<char>(fs),
                     std::istreambuf_iterator<char>());
}

// Expects a HuggingFace tokenizer config at dist/tokenizer.json.
void HuggingFaceTokenizerExample() {
  // Read blob from file.
  auto blob = LoadBytesFromFile("dist/tokenizer.json");
  // Note: all the current factory APIs take an in-memory blob as input.
  // This gives some flexibility on how these blobs can be read.
  auto tok = Tokenizer::FromBlobJSON(blob);
  std::string prompt = "What is the capital of Canada?";
  // Call Encode to turn the prompt into token ids.
  std::vector<int> ids = tok->Encode(prompt);
  // Call Decode to turn the ids back into a string.
  std::string decoded_prompt = tok->Decode(ids);
}

// Expects a sentencepiece model at dist/tokenizer.model.
void SentencePieceTokenizerExample() {
  // Read blob from file.
  auto blob = LoadBytesFromFile("dist/tokenizer.model");
  // Note: all the current factory APIs take an in-memory blob as input.
  // This gives some flexibility on how these blobs can be read.
  auto tok = Tokenizer::FromBlobSentencePiece(blob);
  std::string prompt = "What is the capital of Canada?";
  // Call Encode to turn the prompt into token ids.
  std::vector<int> ids = tok->Encode(prompt);
  // Call Decode to turn the ids back into a string.
  std::string decoded_prompt = tok->Decode(ids);
}
```

### Extra Details

Currently, the project generates three static libraries:

- `libtokenizers_c.a`: the C binding to the tokenizers Rust library
- `libsentencepiece.a`: the sentencepiece static library
- `libtokenizers_cpp.a`: the C++ binding implementation

If you are using an IDE, you can likely first use CMake to generate
these libraries and add them to your development environment.
If you are using CMake, `target_link_libraries(yourlib tokenizers_cpp)`
will automatically link in the other two libraries.
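
If you instead consume the prebuilt archives directly (for example, from an IDE
project that does not run this CMake build), a hedged sketch of the manual
equivalent is below; the `/path/to/...` locations are placeholders that depend
on where your build placed the archives.

```cmake
# Hypothetical manual linking against the prebuilt archives.
# tokenizers_cpp depends on the other two, so it is listed first.
target_link_libraries(yourlib PRIVATE
  /path/to/build/libtokenizers_cpp.a
  /path/to/build/libtokenizers_c.a
  /path/to/build/libsentencepiece.a)
# The headers still need to be on the include path.
target_include_directories(yourlib PRIVATE /path/to/tokenizers-cpp/include)
```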

You can also check out [MLC LLM](https://github.com/mlc-ai/mlc-llm)
as an example of a complete LLM chat application integration.

## JavaScript Support

We use Emscripten to expose tokenizers-cpp to WebAssembly and JavaScript.
Check out [web](web) for more details.

## Acknowledgements

This project is only possible thanks to the shoulders of open-source ecosystems that we stand on.
It is built on top of the sentencepiece and tokenizers libraries.