BibTeX

```bibtex
@inproceedings{10.1145/3757369.3767595,
  author = {Cai, Zhuodi},
  title = {How I Perceive It: Human Memory-Augmented Analogical Reasoning for Machine Visual Interpretation},
  year = {2025},
  isbn = {9798400721298},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3757369.3767595},
  doi = {10.1145/3757369.3767595},
  abstract = {How I Perceive It augments machine visual interpretation with human memory, shifting from superficial seeing to deep perceiving. Although today’s vision language models (VLMs) can generate image captions with a degree of subjectivity, they still struggle to explain the underlying reasons or experiential basis for such subjectivity. Machines can see, but they do not perceive as humans do, who link perception with prior experience and memory. To bridge this gap, this paper introduces a visual interpretation system that integrates individual memory into machine perception, founded on structure-mapping theory. By merging what the machine sees with what the individual remembers, the system produces individualized interpretations that uncover more insightful meanings among visual elements that are not immediately visible on the surface.},
  booktitle = {Proceedings of the SIGGRAPH Asia 2025 Art Papers},
  articleno = {12},
  numpages = {8},
  keywords = {Human-AI Collaboration, Vision-Language Models, VLMs, Cognitive AI, Explainable AI, Visual Interpretation, Memory, Analogical Reasoning},
  series = {SA Art Papers '25}
}
```
| Category     | Year / Institution     | Tool / Material     |
|---|---|---|
| ML/AI Application; Art | 2025 @ University of Washington | Python; JavaScript |
*How I Perceive It* augments machine visual interpretation with human memory, shifting from superficial seeing to deep perceiving. Although today’s vision language models can generate image captions with a degree of subjectivity, they still struggle to explain the experiential basis of such subjectivity. Machines can see, but they do not perceive as humans do: humans link perception with prior experience and memory. This work explores how individual memory may participate in machine visual interpretation through analogical reasoning.
An uploaded image is first described by a vision language model. The generated caption is processed to remove subjective descriptions and retain objective visual information, and the cleaned caption is decomposed into structured visual elements: objects, attributes, and relations. These elements are used to retrieve related memories, through keyword matching and semantic similarity, from a personal archive constructed from the artist’s journal entries. Guided by the relations between visual elements, the system builds an analogical structure between the uploaded image and the retrieved memories, generating an interpretation that merges what the machine recognizes with what the individual remembers.
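As a rough sketch of the retrieval step (the element schema, scoring weights, and embedding model here are illustrative assumptions, not the system's documented implementation), extracted visual elements might be scored against journal entries by combining literal keyword overlap with embedding cosine similarity:

```python
# Hypothetical sketch of memory retrieval: structured visual elements
# from the cleaned caption are matched against journal entries using
# keyword overlap plus semantic similarity. Schema, weights, and model
# are illustrative assumptions.
from dataclasses import dataclass

import numpy as np
from sentence_transformers import SentenceTransformer  # pip install sentence-transformers


@dataclass
class VisualElements:
    objects: list[str]      # e.g. ["window", "rain"]
    attributes: list[str]   # e.g. ["blurred", "grey"]
    relations: list[str]    # e.g. ["rain streaks down the window"]

    def keywords(self) -> set[str]:
        return {w.lower() for phrase in self.objects + self.attributes
                for w in phrase.split()}


def retrieve_memories(elements: VisualElements, journal: list[str],
                      top_k: int = 3, alpha: float = 0.5) -> list[str]:
    """Rank journal entries by a weighted mix of keyword overlap
    (weight alpha) and embedding cosine similarity (weight 1 - alpha)."""
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query = " ".join(elements.objects + elements.attributes + elements.relations)
    q_vec = model.encode([query])[0]
    entry_vecs = model.encode(journal)
    kws = elements.keywords()

    scores = []
    for entry, e_vec in zip(journal, entry_vecs):
        entry_words = {w.lower().strip(".,;!?") for w in entry.split()}
        overlap = len(kws & entry_words) / max(len(kws), 1)
        cosine = float(np.dot(q_vec, e_vec)
                       / (np.linalg.norm(q_vec) * np.linalg.norm(e_vec)))
        scores.append(alpha * overlap + (1 - alpha) * cosine)

    top = np.argsort(scores)[::-1][:top_k]
    return [journal[i] for i in top]
```

The `alpha` weight trades literal overlap against looser semantic resemblance; the text does not specify how the two signals are combined, so an equal weighting is assumed here.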
This interpretation process is further extended into an audiovisual artwork. Recognized objects from the image are processed through edge detection and segmentation, then used as conditioning inputs for image generation. Memories retrieved from the archive guide the generation of new visual fragments, which are recombined into a collage-like animation. Glitch effects reflect the fragmentation and uncertainty of human memory. The resulting visuals are accompanied by synthesized sound and narration generated with a voice cloning model trained on the artist’s voice.
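The edge-conditioning and glitch steps can be made concrete with a minimal sketch, assuming OpenCV's Canny detector for edges and a simple band-shift glitch; the actual generative model and effect pipeline used in the artwork are not specified:

```python
# Minimal sketch: derive an edge map to condition image generation,
# and apply a crude glitch pass to a frame. Thresholds, band height,
# and shift amount are arbitrary illustrative choices.
import cv2
import numpy as np


def edge_condition(image_path: str, low: int = 100, high: int = 200) -> np.ndarray:
    """Return a 3-channel edge map usable as a conditioning image."""
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    edges = cv2.Canny(gray, low, high)              # keep strong contours only
    return cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)  # generators expect 3 channels


def glitch(frame: np.ndarray, band: int = 24, shift: int = 12,
           seed: int = 0) -> np.ndarray:
    """Glitch effect: horizontally offset random bands of pixels."""
    rng = np.random.default_rng(seed)
    out = frame.copy()
    for y in range(0, frame.shape[0], band):
        dx = int(rng.integers(-shift, shift + 1))
        out[y:y + band] = np.roll(frame[y:y + band], dx, axis=1)
    return out
```

An edge-conditioned generator (a ControlNet-style pipeline, for example) would consume the map from `edge_condition`, while `glitch` would be applied per frame of the collage animation; both pairings are assumptions rather than the documented pipeline.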
Like Funes, as described by Jorge Luis Borges in *Funes the Memorious*, a machine may remember too much to abstract meaningfully. Meaning may emerge not from what could be remembered, but from what must be forgotten. Through this entanglement, human and machine jointly decide what to discard and what to preserve, thereby constructing meaning.
