Skip to content

Commit 0874123

Browse files
committed
[Llava] Add csv image loading in C++ runner
Rationale - We don't want to depend on Torch when building for Android. This adds two things: (1) for AoT, the Python image_util optionally generates a .csv from a jpg, in addition to the .pt; (2) a runtime runner flag that hints that the provided image is a csv, in which case the runner parses the csv and feeds it to the model. This is very naive and obviously fragile. Added some checks in Python. Tested a few ways: - On M1, with torch, loaded both the .pt and the .csv generated from the same jpg, and the LLM produces the same text. - On Android, without torch, loaded the .csv and it also produces similar text.
1 parent 444480b commit 0874123

File tree

4 files changed

+217
-67
lines changed

4 files changed

+217
-67
lines changed

.ci/scripts/test_llava.sh

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ if [[ "${TARGET_OS_lower}" == "android" ]]; then
2525
exit 1
2626
fi
2727
fi
28+
PTE=llava.pte
29+
IMAGE_PT=image.pt
30+
IMAGE_CSV=image.csv
31+
TOKENIZER=tokenizer.bin
2832

2933
# Number of processes for a parallel build
3034
NPROC=8
@@ -96,7 +100,7 @@ cmake_build_llava_runner_for_android() {
96100
-DANDROID_PLATFORM=android-23 \
97101
${LLAVA_COMMON_CMAKE_ARGS} \
98102
-DCMAKE_PREFIX_PATH="$python_lib" \
99-
-DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
103+
-DBUILD_LLAVA_RUNNER_WITHOUT_TORCH=ON \
100104
-B${BUILD_DIR}/${dir} \
101105
${dir}
102106

@@ -106,67 +110,87 @@ cmake_build_llava_runner_for_android() {
106110
# only export the one without custom op for now since it's
107111
#######################################
# Export the Llava model to a .pte file together with its artifacts.
# Globals:   PYTHON_EXECUTABLE (read), PTE (read)
# Arguments: none
# Outputs:   progress message on stdout, plus whatever the exporter prints
# Returns:   exit status of the export command
#######################################
export_llava() {
  echo "Starting to export Llava. This will take about 6 mins"
  # PYTHON_EXECUTABLE is deliberately left unquoted so that a value carrying
  # extra interpreter flags keeps word-splitting as before; the output path is
  # quoted so a name containing whitespace stays a single argument.
  # shellcheck disable=SC2086
  $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava \
    --pte-name "${PTE}" --with-artifacts
}
111115

112116
# Download a new image with different size, to test if the model can handle different image sizes
113117
#######################################
# Download the test image and convert it into the runner's input formats.
# Globals:   PYTHON_EXECUTABLE (read), IMAGE_PT (read), IMAGE_CSV (read)
# Arguments: none
# Outputs:   writes basketball.jpg, ${IMAGE_PT} and ${IMAGE_CSV} in the
#            current directory
# Returns:   1 if the download fails, otherwise the converter's exit status
#######################################
prepare_image_tensor() {
  echo "Downloading image"
  # --fail makes curl return non-zero on an HTTP error instead of saving the
  # server's error page as basketball.jpg; --location follows redirects.
  if ! curl --fail --location -o basketball.jpg \
    "https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg"; then
    echo "Failed to download basketball.jpg" >&2
    return 1
  fi
  # PYTHON_EXECUTABLE is deliberately left unquoted to preserve the original
  # word-splitting behaviour; the output paths are quoted.
  # shellcheck disable=SC2086
  $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util \
    --image-path basketball.jpg \
    --output-path "${IMAGE_PT}" \
    --output-csv "${IMAGE_CSV}"
}
118122

119123
#######################################
# Run llava_main on both the .pt and the .csv image and verify the output.
#
# The generated text must contain the expected prefix, and the .pt and .csv
# inputs must produce identical text.
#
# Globals:   PTE, IMAGE_PT, IMAGE_CSV, TOKENIZER, TARGET_OS_lower, BUILD_DIR
#            (all read)
# Arguments: none
# Outputs:   writes result.txt and result_csv.txt in the current directory;
#            diagnostics on stdout
# Returns:   never returns; exits 0 on success (or when targeting android,
#            where nothing is run) and 1 on any failure
#######################################
run_and_verify() {
  local now
  now=$(date +"%H:%M:%S")
  echo "Starting to run llava runner at ${now}"

  if [[ ! -f "${PTE}" ]]; then
    echo "Could not find PTE: ${PTE}. Export failed. Aborting."
    exit 1
  fi
  if [[ ! -f "${IMAGE_PT}" ]]; then
    echo ".pt Image file: ${IMAGE_PT} is missing."
    exit 1
  fi
  if [[ ! -f "${IMAGE_CSV}" ]]; then
    echo "csv Image file ${IMAGE_CSV} is missing."
    exit 1
  fi
  if [[ ! -f "${TOKENIZER}" ]]; then
    echo "Tokenizer: ${TOKENIZER} is missing."
    exit 1
  fi

  # Arguments shared by both invocations. An array keeps each argument intact
  # even when a path contains whitespace.
  local -a runtime_args=(
    "--model_path=${PTE}"
    "--tokenizer_path=${TOKENIZER}"
    "--prompt=ASSISTANT:"
    "--temperature=0"
    "--seq_len=650"
  )

  if [[ "${TARGET_OS_lower}" == "android" ]]; then
    echo "Transfer relevant files to the phone via ADB and run llava_main binary."
    exit 0
  fi

  local runner="${BUILD_DIR}/examples/models/llava/llava_main"

  if ! "${runner}" "${runtime_args[@]}" \
    --image_path="${IMAGE_PT}" > result.txt; then
    echo "llava_main failed on .pt image: ${IMAGE_PT}"
    exit 1
  fi

  if ! "${runner}" "${runtime_args[@]}" \
    --image_path="${IMAGE_CSV}" --is_csv_image=true > result_csv.txt; then
    echo "llava_main failed on csv image: ${IMAGE_CSV}"
    exit 1
  fi

  # verify result.txt
  local result result_csv expected_prefix
  result=$(<result.txt)
  result_csv=$(<result_csv.txt)

  if [[ "$(uname)" == "Darwin" ]]; then
    expected_prefix="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
    # TODO: This is what it produces on M1.
    # expected_prefix="ASSISTANT: image captures a basketball game in progress, with a group of men playing on a basketball court. There are at least nine players on the court, each actively"
  else
    # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
    expected_prefix="ASSISTANT:"
  fi

  echo "Expected result prefix: ${expected_prefix}"
  echo "Actual result: ${result}"
  echo "Actual result (csv): ${result_csv}"
  if [[ "${result}" != *"${expected_prefix}"* ]]; then
    echo "Failure; generated text is not as expected"
    exit 1
  fi
  echo "Success"

  # Compare the two runs against each other. Re-testing the prefix here, as
  # the earlier code did, could never detect a .pt/.csv mismatch.
  if [[ "${result}" == "${result_csv}" ]]; then
    echo ".pt and .csv image produced same output"
    echo "Success"
    exit 0
  else
    echo ".pt and .csv image inputs produced different outputs"
    echo ".pt image produced: ${result}"
    echo ".csv image produced: ${result_csv}"
    echo "Failure"
    exit 1
  fi
}
@@ -187,4 +211,7 @@ export_llava
187211

188212
# Step3. Run
189213
prepare_image_tensor
190-
run_and_verify
214+
# Only run if building for native
215+
if [[ "${TARGET_OS_lower}" == "native" ]]; then
216+
run_and_verify
217+
fi

examples/models/llava/CMakeLists.txt

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ project(llava)
2222
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)
2323

2424
# This is a temporary hack to get around Torch dep so we can test this on android
25-
option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF)
25+
option(BUILD_LLAVA_RUNNER_WITHOUT_TORCH "Build Llava runner without torch dep" OFF)
2626

2727
include(CMakeDependentOption)
2828
#
@@ -74,10 +74,9 @@ set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
7474
find_package(gflags REQUIRED)
7575

7676
# Avoid torch dep from torch.load()-ing the image.
77-
# This is a temporary hack.
78-
if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
79-
add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1)
80-
message("Buidling the runner without Torch, feeding a dummy image!")
77+
# Use a CSV formatted image instead
78+
if(BUILD_LLAVA_RUNNER_WITHOUT_TORCH)
79+
add_definitions(-DBUILD_LLAVA_RUNNER_WITHOUT_TORCH=1)
8180
else()
8281
find_package(Torch CONFIG REQUIRED)
8382
endif()
@@ -106,7 +105,7 @@ endif()
106105
add_subdirectory(runner)
107106

108107
set(LINK_LIBS gflags)
109-
if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
108+
if(NOT BUILD_LLAVA_RUNNER_WITHOUT_TORCH)
110109
list(APPEND LINK_LIBS torch)
111110
endif()
112111
set(link_libraries ${LINK_LIBS})

examples/models/llava/image_util.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,38 @@ def serialize_image(image: torch.Tensor, path: str) -> None:
5656
logging.info(f"Saved image tensor to {path}")
5757

5858

59+
def image_to_csv(image: torch.Tensor, path: str) -> None:
60+
im = torch.tensor(image)
61+
62+
# List of restrictions on the image tensor to ensure our naive csv converter works
63+
assert im.dim() == 3, "Image must be 3D"
64+
assert im.shape[0] == 3, "Image must have 3 channels"
65+
assert im.dtype == torch.uint8, "Image data must be uint8"
66+
assert im.device == torch.device("cpu"), "Image data must be on CPU"
67+
assert im.is_contiguous(), "Image data must be contiguous"
68+
assert im.dim_order() == (0, 1, 2), "Image data must be in CHW format"
69+
70+
# write header
71+
with open(path, "w") as f:
72+
# header
73+
# dims, shape[...], sizeof(dtype)
74+
f.write(f"{im.dim()},")
75+
for i in range(im.dim()):
76+
f.write(f"{im.shape[i]},")
77+
f.write("1\n") # sizeof(uint8_t), add a newline to end header
78+
79+
# append data as bytes
80+
with open(path, "ab") as f:
81+
# data as bytes
82+
for i in range(im.shape[0]):
83+
for j in range(im.shape[1]):
84+
for k in range(im.shape[2]):
85+
b = int(im[i][j][k]).to_bytes(1, byteorder="little")
86+
f.write(b)
87+
88+
logging.info(f"Saved image csv to {path}")
89+
90+
5991
def main():
6092
parser = ArgumentParser()
6193
parser.add_argument(
@@ -67,12 +99,19 @@ def main():
6799
"--output-path",
68100
default="image.pt",
69101
)
102+
parser.add_argument(
103+
"--output-csv",
104+
required=False,
105+
)
70106
args = parser.parse_args()
71107

72108
image = Image.open(args.image_path)
73109
image_tensor = prepare_image(image, target_h=336, target_w=336)
74110
serialize_image(image_tensor, args.output_path)
75111

112+
if args.output_csv:
113+
image_to_csv(image_tensor, args.output_csv)
114+
76115

77116
if __name__ == "__main__":
78117
main()

0 commit comments

Comments
 (0)