[Llava] Add csv image loading in C++ runner

digantdesai · digantdesai · commit 087412323c9d · 2024-09-17T23:19:39.000-07:00
Rationale - We don't want to depend on Torch when building for Android.

This adds two things:
(1) for AoT, python image_util optionally generates a .csv from a jpg, in
addition to the .pt.

(2) add a runtime runner flag which indicates that the provided image is a
csv. If so, the runner parses the csv and feeds it to the model.

This is very naive and obviously fragile. Added some checks in python.

Tested a few ways:
- On M1, with torch, loaded both .pt and .csv generated from the same
  jpg. And the LLM produces same text.
- On Android, without torch, loaded .csv and it also produces similar
  text.
diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh
@@ -25,6 +25,10 @@ if [[ "${TARGET_OS_lower}" == "android" ]]; then
         exit 1
     fi
 fi
+PTE=llava.pte
+IMAGE_PT=image.pt
+IMAGE_CSV=image.csv
+TOKENIZER=tokenizer.bin
 
 # Number of processes for a parallel build
 NPROC=8
@@ -96,7 +100,7 @@ cmake_build_llava_runner_for_android() {
         -DANDROID_PLATFORM=android-23                                           \
         ${LLAVA_COMMON_CMAKE_ARGS}                                              \
         -DCMAKE_PREFIX_PATH="$python_lib"                                       \
-        -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON                                  \
+        -DBUILD_LLAVA_RUNNER_WITHOUT_TORCH=ON                                   \
         -B${BUILD_DIR}/${dir}                                                   \
         ${dir}
 
@@ -106,67 +110,87 @@ cmake_build_llava_runner_for_android() {
 # only export the one without custom op for now since it's
 export_llava() {
     echo "Starting to export Llava. This will take about 6 mins"
-    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name ${PTE} --with-artifacts
 }
 
 # Download a new image with different size, to test if the model can handle different image sizes
 prepare_image_tensor() {
     echo "Downloading image"
     curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg
-    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path ${IMAGE_PT} --output-csv ${IMAGE_CSV}
 }
 
 run_and_verify() {
     NOW=$(date +"%H:%M:%S")
     echo "Starting to run llava runner at ${NOW}"
-    if [[ ! -f "llava.pte" ]]; then
-        echo "Export failed. Abort"
+    if [[ ! -f "${PTE}" ]]; then
+        echo "Could not find PTE: ${PTE}. Export failed. Aborting."
         exit 1
     fi
-    if [[ ! -f "image.pt" ]]; then
-        echo "image.pt is missing."
+    if [[ ! -f "${IMAGE_PT}" ]]; then
+        echo ".pt Image file: ${IMAGE_PT} is missing."
         exit 1
     fi
-    if [[ ! -f "tokenizer.bin" ]]; then
-        echo "tokenizer.bin is missing."
+    if [[ ! -f "${IMAGE_CSV}" ]]; then
+        echo "csv Image file ${IMAGE_CSV} is missing."
+        exit 1
+    fi
+    if [[ ! -f "${TOKENIZER}" ]]; then
+        echo "Tokenizer: ${TOKENIZER} is missing."
         exit 1
     fi
 
-
-
-    RUNTIME_ARGS="--model_path=llava.pte    \
-        --tokenizer_path=tokenizer.bin      \
-        --image_path=image.pt               \
-        --prompt=ASSISTANT:                 \
-        --temperature=0                     \
+    RUNTIME_ARGS="--model_path=${PTE} \
+        --tokenizer_path=${TOKENIZER} \
+        --prompt=ASSISTANT:           \
+        --temperature=0               \
         --seq_len=650"
 
     if [[ "${TARGET_OS_lower}" == "android" ]]; then
-        echo "Transfer relevant files to the phone via ADB and run llava_main with following args,"
-        echo "$ llava_main ${RUNTIME_ARGS} "
+        echo "Transfer relevant files to the phone via ADB and run llava_main binary."
         exit 0;
     fi
 
-    ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt
+    ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} \
+        --image_path=${IMAGE_PT} > result.txt
+
+    ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} \
+        --image_path=${IMAGE_CSV} --is_csv_image=true > result_csv.txt
 
     # verify result.txt
     RESULT=$(cat result.txt)
+    RESULT_CSV=$(cat result_csv.txt)
     # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
     if [[ "$(uname)" == "Darwin" ]]; then
         EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
+        # TODO: This is what it produces on M1.
+        # EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with a group of men playing on a basketball court. There are at least nine players on the court, each actively"
     else
         # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
         EXPECTED_PREFIX="ASSISTANT:"
     fi
     if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
         echo "Expected result prefix: ${EXPECTED_PREFIX}"
         echo "Actual result: ${RESULT}"
+        echo "Actual result (csv): ${RESULT_CSV}"
         echo "Success"
-        exit 0
     else
         echo "Expected result prefix: ${EXPECTED_PREFIX}"
         echo "Actual result: ${RESULT}"
-        echo "Failure; results not the same"
+        echo "Actual result (csv): ${RESULT_CSV}"
+        echo "Failure; generated text is not as expected"
+        exit 1
+    fi
+
+    if [[ "${RESULT_CSV}" == *"${EXPECTED_PREFIX}"* ]]; then  # the .csv run must yield the expected text too
+        echo ".pt and .csv image produced same output"
+        echo "Success"
+        exit 0
+    else
+        echo ".pt and .csv image inputs produced different outputs"
+        echo ".pt image produced: ${RESULT}"
+        echo ".csv image produced: ${RESULT_CSV}"
+        echo "Failure"
         exit 1
     fi
 }
@@ -187,4 +211,7 @@ export_llava
 
 # Step3. Run
 prepare_image_tensor
-run_and_verify
+# Only run if building for native
+if [[ "${TARGET_OS_lower}" == "native" ]]; then
+    run_and_verify
+fi
diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt
@@ -22,7 +22,7 @@ project(llava)
 option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)
 
 # This is a temporary hack to get around Torch dep so we can test this on android
-option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF)
+option(BUILD_LLAVA_RUNNER_WITHOUT_TORCH "Build Llava runner without torch dep" OFF)
 
 include(CMakeDependentOption)
 #
@@ -74,10 +74,9 @@ set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
 find_package(gflags REQUIRED)
 
 # Avoid torch dep from torch.load()-ing the image.
-# This is a temporary hack.
-if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
-  add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1)
-  message("Buidling the runner without Torch, feeding a dummy image!")
+# Use a CSV formatted image instead
+if(BUILD_LLAVA_RUNNER_WITHOUT_TORCH)
+  add_definitions(-DBUILD_LLAVA_RUNNER_WITHOUT_TORCH=1)
 else()
   find_package(Torch CONFIG REQUIRED)
 endif()
@@ -106,7 +105,7 @@ endif()
 add_subdirectory(runner)
 
 set(LINK_LIBS gflags)
-if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
+if(NOT BUILD_LLAVA_RUNNER_WITHOUT_TORCH)
   list(APPEND LINK_LIBS torch)
 endif()
 set(link_libraries ${LINK_LIBS})
diff --git a/examples/models/llava/image_util.py b/examples/models/llava/image_util.py
@@ -56,6 +56,38 @@ def serialize_image(image: torch.Tensor, path: str) -> None:
     logging.info(f"Saved image tensor to {path}")
 
 
+def image_to_csv(image: torch.Tensor, path: str) -> None:
+    """Write a CHW uint8 image as an ASCII header line plus raw pixel bytes.
+
+    Header: "<ndim>,<shape[0]>,...,<shape[ndim-1]>,<sizeof(dtype)>\n"; the
+    pixel bytes follow immediately, in contiguous CHW order.
+
+    Args:
+        image: 3-D, 3-channel, contiguous uint8 CPU tensor (or data that
+            torch.as_tensor can convert into one).
+        path: Destination file; overwritten if it already exists.
+
+    Raises:
+        AssertionError: If the image violates any restriction below.
+    """
+    im = torch.as_tensor(image)
+
+    # List of restrictions on the image tensor to ensure our naive csv converter works
+    assert im.dim() == 3, "Image must be 3D"
+    assert im.shape[0] == 3, "Image must have 3 channels"
+    assert im.dtype == torch.uint8, "Image data must be uint8"
+    assert im.device == torch.device("cpu"), "Image data must be on CPU"
+    assert im.is_contiguous(), "Image data must be contiguous"
+    assert im.dim_order() == (0, 1, 2), "Image data must be in CHW format"
+
+    # One binary-mode write: no newline translation, no per-pixel Python loop.
+    header = ",".join(str(n) for n in (im.dim(), *im.shape, 1)) + "\n"
+    with open(path, "wb") as f:
+        f.write(header.encode("ascii") + bytes(im.flatten().tolist()))
+
+    logging.info(f"Saved image csv to {path}")
+
+
 def main():
     parser = ArgumentParser()
     parser.add_argument(
@@ -67,12 +99,19 @@ def main():
         "--output-path",
         default="image.pt",
     )
+    parser.add_argument(
+        "--output-csv",
+        required=False,
+    )
     args = parser.parse_args()
 
     image = Image.open(args.image_path)
     image_tensor = prepare_image(image, target_h=336, target_w=336)
     serialize_image(image_tensor, args.output_path)
 
+    if args.output_csv:
+        image_to_csv(image_tensor, args.output_csv)
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/models/llava/main.cpp b/examples/models/llava/main.cpp