RTOS SDK (license mode): Vision module integration - Alibaba Cloud Model Studio - Alibaba Cloud Help Center

This topic explains how to use the RTOS SDK in license mode to implement vision capabilities, such as video calls and visual question answering.

1. Before you begin

1.1. Prerequisites

This topic builds on "Implement chat capabilities with the RTOS SDK (license mode)". Before continuing, ensure that you have read the topic and prepared the required environment.
This guide assumes you have already integrated the SDK and that the voice interaction workflow is running correctly.
This guide reuses some pseudocode and interaction flows from Implement chat capabilities with the RTOS SDK (license mode).

1.2. Configure and enable the vision module

In the Model Studio console, create a multimodal application.
After the application is created, the application card displays the application ID (which starts with mm_) and the license usage status. If the license status is "Not Purchased", click Purchase to buy a license. Click Configure App to open the application configuration page.
On the application configuration page, enable the agent skills that you need: Video Call, Visual Question Answering, Photo Translation, and Instant Video Call. You can enable one or more of them.
You can find this option in the Model Studio Applications list, under the Agent section on the Skill tab.

2. Develop the on-device vision module

The vision module features include visual question answering, video call, photo translation, and instant video call.

When you are not in a video call or an instant video call, any question similar to "What is in front of me?" triggers visual question answering (VQA).

When not in a video call or an instant video call, any question with a translation intent, such as "Translate the road sign in front of me," triggers photo translation.

When you are in a video call or an instant video call, the respective service handles all image Q&A logic.

2.1. SDK directory and file structure

After decompressing the SDK package, the relevant files and directories are structured as follows:

For information about how to obtain the SDK package, see RTOS C SDK (License Mode).

aliyun_sdk/
├── include
│   ├── c_utils
│   │   └── ...
│   ├── lib_c_mmi_vl.h
│   └── ...
├── libc_visual.a
└── ...

To enable the vision module, include the c_visual.h header file and link the libc_visual.a static library.

2.2. c_visual initialization

Sample code:

int32_t app_visual_init(void)
{
    c_visual_config_t config = {
        // Combine multiple values. For example, C_VISUAL_MODE_VQA | C_VISUAL_MODE_LIVE_AI.
        // Enables simultaneous use of VQA and LIVE AI.
        // C_VISUAL_MODE_NONE has the lowest priority and acts as a placeholder.
        .visual_mode = C_VISUAL_MODE_VQA | C_VISUAL_MODE_LIVE_AI | C_VISUAL_MODE_OMNI,
        // Image format. Currently unused. See the header file for supported formats.
        .image_format = C_VISUAL_PIC_FORMAT_JPG,
        // Required.
        .data_type = C_VISUAL_DATA_BASE64,
        // This parameter does not currently affect functionality.
        // Input image must be > 10x10 pixels with an aspect ratio between 1:200 and 200:1.
        .frame_size = C_VISUAL_FRAMESIZE_320x240,
        // Image size. Model Studio supports up to 180 KB per photo.
        .image_size = 180 * 1024,
        // Recommended: 2.
        .fps = 2,
        .event_callback = _visual_callback
    };
    c_visual_config(&config);
    return UTIL_SUCCESS;
}

The following logs are printed during initialization and are provided for your reference:

[UT][D][c_visual_config]mode 14
[UT][D][c_visual_config]VISUAL set params success
[UT][D][c_mmi_set_upstream_type]upstream_type[AudioAndVideo]
[UT][I][c_visual_config]upgrade to video stream
[UT][I][c_visual_config]malloc image buffer [184320]
[UT][I][util_malloc]ptr[0x150008000], size 184336
[UT][I][util_malloc]ptr[0x140008000], size 184336
[UT][D][util_double_buffer_init]buffer [0x150008008/0x140008008]
[UT][I][c_visual_config]create base64 buffer [245760]
[UT][I][util_malloc]ptr[0x140038000], size 245776
[UT][D][c_mm_cmd_register]create new domain [visual_qa]
[UT][D][c_mm_cmd_register]domain [visual_qa] add [visual_qa]
[UT][D][c_mm_cmd_register]create new domain [video_chat]
[UT][D][c_mm_cmd_register]domain [video_chat] add [open_videochat]
[UT][D][c_mm_cmd_register]domain [video_chat] add [switch_video_call_success]
[UT][D][c_mm_cmd_register]domain [video_chat] add [quit_videochat]
[UT][D][c_mm_cmd_register]domain [video_chat] add [exit_video_call_success]
[UT][D][c_mm_cmd_register]create new domain [omni]
[UT][D][c_mm_cmd_register]domain [omni] add [send_video_stream]
[UT][D][c_mm_cmd_register]domain [omni] add [stop_video_stream]
[UT][I][c_visual_config]done

2.3. c_visual events

/* Events reported by c_visual through the configured event callback. */
enum {
    /* VQA starts. In this callback, open the camera and capture an image. */
    C_VISUAL_EVENT_VQA_START = 0,
    /* VQA ends. In this callback, close the camera. */
    C_VISUAL_EVENT_VQA_END = 1,
    /* A video call (including an instant video call) starts. In this callback, open the camera. */
    C_VISUAL_EVENT_LIVEAI_START = 2,
    /* Frame sampling during a video call. In this callback, perform image capture. */
    C_VISUAL_EVENT_LIVEAI_ACTION = 3,
    /* A video call ends. In this callback, close the camera. */
    C_VISUAL_EVENT_LIVEAI_STOP = 4,
};

2.3.1. Visual question answering and photo translation

The following code shows an example of how to handle events for visual question answering and photo translation:

// c_visual event callback (excerpt): handling of the visual question answering
// and photo translation events.
//
// event: one of the C_VISUAL_EVENT_* values.
// param: not used by this handler.
//
// Always returns UTIL_SUCCESS.
static int32_t _visual_callback(uint32_t event, void* param)
{
    (void)param;
    switch (event) {
    case C_VISUAL_EVENT_VQA_START:
        // VQA started: open the camera so that an image can be taken.
        UTIL_LOG_I("vqa start");
        c_camera_open();
        // Manually trigger a photo capture. Whether to call this function depends on your specific implementation.
        c_camera_capture();
        break; 
    case C_VISUAL_EVENT_VQA_END:
        // VQA finished: the camera is no longer needed.
        UTIL_LOG_I("vqa end");
        c_camera_close();
        break;
    // The remaining event cases are omitted from this excerpt.
    ...
    }
    return UTIL_SUCCESS;
}

2.3.2. Video call and instant video call

The following code shows an example of how to handle events for video calls and instant video calls:

// c_visual event callback (excerpt): handling of the video call and
// instant video call events.
//
// event: one of the C_VISUAL_EVENT_* values.
// param: not used by this handler.
//
// Always returns UTIL_SUCCESS.
static int32_t _visual_callback(uint32_t event, void* param)
{
    (void)param;
    switch (event) {
    case C_VISUAL_EVENT_LIVEAI_START:
        // Video call started: open the camera.
        UTIL_LOG_I("liveai start");
        c_camera_open();
        break;
    case C_VISUAL_EVENT_LIVEAI_ACTION:
        // Frame sampling request during the call.
        UTIL_LOG_I("liveai capture");
        // Manually trigger a photo capture. Whether to call this function depends on your specific implementation.
        c_camera_capture();
        break;
    case C_VISUAL_EVENT_LIVEAI_STOP:
        // Video call ended: close the camera.
        UTIL_LOG_I("liveai stop");
        c_camera_close();
        break;
    // The remaining event cases are omitted from this excerpt.
    ...
    }
    return UTIL_SUCCESS;
}

2.4. Camera implementation

2.4.1. Image capture logic

Use the following function sequence to pass the captured image to the c_visual module. The module then encapsulates and uploads the data as required by the vision model.

Sample Code 1: Passing captured data to c_visual via a camera callback.

/*
 * Camera capture callback: pass one captured image to c_visual.
 *
 * The image is copied into the buffer returned by
 * c_visual_image_get_buffer() and then submitted with
 * c_visual_image_action().
 *
 * image:      captured image data.
 * image_size: number of bytes available at image.
 *
 * Returns 0 when the image was submitted, or -1 when it was rejected
 * (missing or empty input, no buffer available, or the image is larger
 * than the buffer).
 */
int32_t hal_camera_capture_callback(uint8_t *image, uint32_t image_size)
{
  uint8_t *buffer;
  uint32_t buffer_size = 0;

  /* Reject a missing or empty frame before it reaches memcpy(). */
  if (image == NULL || image_size == 0) {
    UTIL_LOG_W("invalid image");
    return -1;
  }
  buffer = c_visual_image_get_buffer(&buffer_size);
  if (buffer == NULL) {
    UTIL_LOG_W("get buffer fail");
    return -1;
  }
  /* Never copy more than the c_visual buffer can hold. */
  if (buffer_size < image_size) {
    UTIL_LOG_W("image_size > buffer_size");
    return -1;
  }
  memcpy(buffer, image, image_size);
  c_visual_image_action(image_size);
  return 0;
}

Sample Code 2: Creating a thread to periodically send camera data.

/*
 * Camera task: periodically capture an image and submit it to c_visual.
 *
 * Each iteration requests a buffer from c_visual_image_get_buffer(), lets
 * hal_camera_capture() fill it, and submits the result with
 * c_visual_image_action(). The loop never exits.
 *
 * param: not used.
 */
void hal_camera_task_handle(void *param)
{
  (void)param;
  int32_t err;
  uint8_t *buffer;
  uint32_t buffer_size;
  while (1) {
    buffer = c_visual_image_get_buffer(&buffer_size);
    if (buffer == NULL) {
      /* No buffer available right now; retry after a short pause. */
      UTIL_LOG_D("get buffer fail");
      util_msleep(20);
      continue;
    }
    /* buffer_size is passed by address, so the capture call can update it. */
    err = hal_camera_capture(buffer, &buffer_size);
    if (err) {
      UTIL_LOG_D("capture fail");
      util_msleep(20);
      continue;
    }
    UTIL_LOG_I("capture success");
    c_visual_image_action(buffer_size);
    util_msleep(100);  // Sleep before the next capture.
  }
}

2.4.2. Implementation example

The following sample code creates a thread to periodically capture images from the camera. Choose an implementation method suitable for your hardware.

#include "hal_camera.h"
#include "c_visual.h"
// Number of captured images that the camera task discards after
// c_camera_init() or c_camera_open() before it starts submitting images.
#define CAMERA_DROP_NUM         (4)
// State shared between the c_camera_* functions and the camera task.
// NOTE(review): these fields are written by the c_camera_* functions and read
// by the camera task with no synchronization visible in this file - confirm
// that this is acceptable on the target platform.
typedef struct {
    // Set to 1 at the end of c_camera_init(); 0 means not initialized.
    uint8_t camera_initd;
    // 1 between c_camera_open() and c_camera_close(); the task captures only while this is non-zero.
    uint8_t camera_work;
    // Task handle returned by util_task_create(); passed to util_task_delete() when the task exits.
    util_task_t* task;
    // Loop condition of the camera task; cleared by c_camera_deinit() to make the task exit.
    uint8_t task_running;
    // Number of upcoming captured images that are still to be discarded.
    uint8_t drop_img_num;
} _camera_info_t;
// Single module-wide instance, zero-initialized.
static _camera_info_t _camera_info = { 0 };
/*
 * Entry function of the camera task.
 *
 * While task_running is set and the camera is working, the task captures
 * images into the c_visual buffer, discards the first drop_img_num of them,
 * and submits the rest. When task_running is cleared, it resets the module
 * state, closes the camera, and deletes its own task.
 *
 * params: not used.
 */
static void _camera_task_entry(void* params) {
    uint8_t *frame;
    uint32_t frame_len;
    int32_t rc;

    (void)params;

    while (_camera_info.task_running) {
        /* Camera not opened by c_camera_open(): idle. */
        if (!_camera_info.camera_work) {
            util_msleep(20);
            continue;
        }

        frame = c_visual_image_get_buffer(&frame_len);
        if (frame == NULL) {
            UTIL_LOG_D("get buffer fail");
            util_msleep(20);
            continue;
        }

        rc = hal_camera_capture(frame, &frame_len);
        if (rc != 0) {
            UTIL_LOG_D("capture fail");
            util_msleep(20);
            continue;
        }
        UTIL_LOG_I("capture success");

        /* Discard this image while the drop counter is still non-zero. */
        if (_camera_info.drop_img_num > 0) {
            UTIL_LOG_I("drop [%d]", _camera_info.drop_img_num);
            _camera_info.drop_img_num--;
            continue;
        }

        c_visual_image_action(frame_len);
        c_visual_task_handle();
    }

    /* Save the handle first: the memset below wipes _camera_info.task. */
    util_task_t* self = _camera_info.task;
    memset(&_camera_info, 0, sizeof(_camera_info));
    hal_camera_close();
    UTIL_LOG_I("task will exit");
    util_task_delete(self);
}
int32_t c_camera_init(void)
{
    int32_t err;
    if (_camera_info.camera_initd) {
        return UTIL_SUCCESS;
    }
    memset(&_camera_info, 0, sizeof(_camera_info));
    _camera_info.drop_img_num = CAMERA_DROP_NUM;
    hal_camera_open();
    _camera_info.task_running = 1;
    _camera_info.task = util_task_create("CAMERA", CAMERA_TASK_PRIORITY, CAMERA_STACK_SIZE, _camera_task_entry, NULL);
    _camera_info.camera_initd = 1;
    UTIL_LOG_I("done");
    return UTIL_SUCCESS;
}
/*
 * Start camera capture.
 *
 * Initializes the camera module first if needed, resets the c_visual data,
 * opens the camera, and re-arms the drop counter so that the first
 * CAMERA_DROP_NUM images are discarded.
 *
 * Returns UTIL_SUCCESS on success (or if the camera is already working);
 * otherwise returns the error reported by c_camera_init().
 */
int32_t c_camera_open(void)
{
    if (_camera_info.camera_initd == 0) {
        /* Lazy initialization; do not continue if it fails. */
        int32_t err = c_camera_init();
        if (err != UTIL_SUCCESS) {
            UTIL_LOG_E("init fail");
            return err;
        }
    }
    if (_camera_info.camera_work) {
        UTIL_LOG_D("already");
        return UTIL_SUCCESS;
    }
    c_visual_data_reset();
    hal_camera_open();
    _camera_info.drop_img_num = CAMERA_DROP_NUM;
    /* Set last: the camera task starts capturing once this is non-zero. */
    _camera_info.camera_work = 1;
    UTIL_LOG_I("done");
    return UTIL_SUCCESS;
}
/*
 * Stop camera capture.
 *
 * Returns UTIL_ERR_NO_INIT when the module has not been initialized,
 * otherwise UTIL_SUCCESS (also when the camera was already stopped).
 */
int32_t c_camera_close(void)
{
    if (!_camera_info.camera_initd) {
        UTIL_LOG_E("not initialized");
        return UTIL_ERR_NO_INIT;
    }

    if (_camera_info.camera_work != 0) {
        /* Clear the flag first so the camera task stops capturing. */
        _camera_info.camera_work = 0;
        hal_camera_close();
        UTIL_LOG_I("done");
    } else {
        UTIL_LOG_D("already");
    }
    return UTIL_SUCCESS;
}
/*
 * Request shutdown of the camera module.
 *
 * Only clears task_running; the camera task performs the actual teardown
 * when it leaves its loop. Always returns UTIL_SUCCESS.
 */
int32_t c_camera_deinit(void)
{
    if (_camera_info.camera_initd != 0) {
        _camera_info.task_running = 0;
        UTIL_LOG_I("prepare");
    } else {
        UTIL_LOG_D("already");
    }
    return UTIL_SUCCESS;
}