Skip to content

Commit eda319e

Browse files
authored
Remove old tokenizer/ directory in ExecuTorch
Differential Revision: D72007597
Pull Request resolved: #9728
1 parent eee2bf1 commit eda319e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed, with 24 additions (+24) and 130,948 deletions (-130948).

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,7 @@ $PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
269269

270270
# Create tokenizer.bin.
271271
echo "Creating tokenizer.bin"
272-
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
272+
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
273273

274274

275275
RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=tokenizer.bin --prompt=Once --temperature=0 --seq_len=10 --warmup=1"

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ cmake --build cmake-out/examples/models/llama -j16 --config Release
5555
download_stories_model_artifacts
5656

5757
echo "Creating tokenizer.bin"
58-
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
58+
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
5959

6060
# Export model
6161
LLAMA_CHECKPOINT=stories110M.pt

.ci/scripts/test_phi_3_mini.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ cmake_build_phi_3_mini() {
5656
prepare_tokenizer() {
5757
echo "Downloading and converting tokenizer.model"
5858
wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
59-
$PYTHON_EXECUTABLE -m executorch.extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
59+
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
6060
}
6161

6262
# Export phi-3-mini model to pte

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ pip install graphviz
3030
# Download stories llama110m artifacts
3131
download_stories_model_artifacts
3232
echo "Creating tokenizer.bin"
33-
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
33+
$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
3434

3535
set +e
3636
# Compile only as weight sharing is not applicable on x86

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -757,7 +757,7 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
757757
endif()
758758

759759
if(EXECUTORCH_BUILD_EXTENSION_LLM)
760-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
760+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers)
761761
endif()
762762

763763
if(EXECUTORCH_BUILD_EXTENSION_MODULE)

examples/models/llama/TARGETS

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,7 @@ runtime.python_library(
202202
":export_library",
203203
"//executorch/examples/models/llama/tokenizer:tiktoken_py",
204204
"//executorch/extension/llm/export:export_lib",
205-
"//executorch/extension/llm/tokenizer:tokenizer_py_lib",
205+
"//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
206206
"//executorch/extension/pybindings:portable_lib",
207207
],
208208
)

examples/models/llama/eval_llama_lib.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15,14 +15,12 @@
1515
from executorch.examples.models.llama.export_llama_lib import (
1616
get_quantizer_and_quant_params,
1717
)
18-
from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
1918

2019
from executorch.extension.llm.export.builder import LLMEdgeManager
21-
from executorch.extension.llm.tokenizer.tokenizer import (
22-
Tokenizer as SentencePieceTokenizer,
23-
)
24-
from executorch.extension.llm.tokenizer.utils import get_tokenizer
2520
from lm_eval.evaluator import simple_evaluate
21+
from pytorch_tokenizers import get_tokenizer
22+
from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
23+
from pytorch_tokenizers.tiktoken import TiktokenTokenizer as Tiktoken
2624
from torch.nn import CrossEntropyLoss
2725
from tqdm import tqdm
2826

examples/models/llama/evaluate/eager_eval.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,12 +8,10 @@
88
from typing import Optional, Union
99

1010
import torch
11-
from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
12-
from executorch.extension.llm.tokenizer.tokenizer import (
13-
Tokenizer as SentencePieceTokenizer,
14-
)
1511

1612
from lm_eval.models.huggingface import HFLM as eval_wrapper
13+
from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
14+
from pytorch_tokenizers.tiktoken import TiktokenTokenizer as Tiktoken
1715

1816
from torch import nn
1917

examples/models/llama/runner/generation.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
import torch
1212

13-
from executorch.extension.llm.tokenizer.utils import get_tokenizer
13+
from pytorch_tokenizers import get_tokenizer
1414

1515

1616
def sample_top_p(probs, p):

examples/models/llama/tokenizer/targets.bzl

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@ def define_common_targets():
1616
],
1717
exported_deps = [
1818
"//pytorch/tokenizers:tiktoken",
19-
"//executorch/extension/llm/tokenizer:tiktoken", # TODO: remove
2019
],
2120
visibility = [
2221
"@EXECUTORCH_CLIENTS",

examples/models/llama/tokenizer/test/test_tiktoken.cpp

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
#include <vector>
1212

13-
#include <executorch/extension/llm/tokenizer/tiktoken.h>
13+
#include <pytorch/tokenizers/tiktoken.h>
1414

1515
#include <gtest/gtest.h>
1616

@@ -21,9 +21,9 @@
2121
using namespace ::testing;
2222

2323
using ::example::Version;
24-
using ::executorch::extension::llm::Tokenizer;
25-
using ::executorch::runtime::Error;
26-
using ::executorch::runtime::Result;
24+
using ::tokenizers::Error;
25+
using ::tokenizers::Result;
26+
using ::tokenizers::Tokenizer;
2727

2828
static std::string get_resource_path(const std::string& name) {
2929
#ifdef EXECUTORCH_FB_BUCK
@@ -36,7 +36,7 @@ static std::string get_resource_path(const std::string& name) {
3636
class MultimodalTiktokenV5ExtensionTest : public Test {
3737
public:
3838
void SetUp() override {
39-
tokenizer_ = std::make_unique<executorch::extension::llm::Tiktoken>(
39+
tokenizer_ = std::make_unique<tokenizers::Tiktoken>(
4040
example::get_multimodal_special_tokens(), 0, 1);
4141
modelPath_ = get_resource_path("test_tiktoken_tokenizer.model");
4242
}

examples/models/llava/export_llava.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,8 @@
4646
)
4747

4848
from executorch.extension.llm.export.builder import DType, LLMEdgeManager
49-
from executorch.extension.llm.tokenizer.tokenizer import Tokenizer
5049
from executorch.util.activation_memory_profiler import generate_memory_trace
50+
from pytorch_tokenizers.llama2c import Llama2cTokenizer as Tokenizer
5151
from torch.export import Dim
5252
from torch.nn.attention import SDPBackend
5353

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
# LICENSE file in the root directory of this source tree.
66

77
# model sharding with custom op
8-
set(CUSTOM_OP_SRCS_FILE
8+
set(CUSTOM_OP_SRCS_FILE
99
"${EXECUTORCH_SOURCE_DIR}/extension/llm/custom_ops/op_fallback.cpp"
1010
)
1111
add_library(custom_ops ${CUSTOM_OP_SRCS_FILE})
@@ -35,7 +35,7 @@ list(
3535
list(
3636
APPEND
3737
_llama_runner__srcs
38-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
38+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp
3939
${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp
4040
)
4141

extension/llm/export/TARGETS

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,6 @@ runtime.python_library(
4040
"//executorch/exir:lib",
4141
"//executorch/exir/backend:backend_details",
4242
"//executorch/extension/export_util:export_util",
43-
"//executorch/extension/llm/tokenizer:tokenizer_py_lib",
43+
"//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
4444
],
4545
)

extension/llm/export/builder.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535
from executorch.extension.export_util.utils import export_to_edge, save_pte_program
3636

3737
from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
38-
from executorch.extension.llm.tokenizer.utils import get_tokenizer
38+
from pytorch_tokenizers import get_tokenizer
3939
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
4040
from torch.ao.quantization.quantizer import Quantizer
4141
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer

extension/llm/tokenizer/CMakeLists.txt

Lines changed: 0 additions & 62 deletions
This file was deleted.

extension/llm/tokenizer/TARGETS

Lines changed: 0 additions & 8 deletions
This file was deleted.

extension/llm/tokenizer/__init__.py

Whitespace-only changes.

Comments (0): this commit has no comments.