[
    {
        "id": "386",
        "title": "Synthesizing Composite Hierarchical Structure from Symbolic Music Corpora",
        "authors": "Ilana Shapiro, Ruanqianqian (Lisa) Huang, Zachary Novack, Cheng-i Wang, Hao-Wen Dong, Taylor Berg-Kirkpatrick, Shlomo Dubnov, Sorin Lerner",
        "abstract": "Western music is an innately hierarchical system of interacting levels of structure, from fine-grained melody to high-level form. In order to analyze music compositions holistically and at multiple granularities, we propose a unified, hierarchical meta-representation of musical structure called the structural temporal graph (STG). For a single piece, the STG is a data structure that defines a hierarchy of progressively finer structural musical features and the temporal relationships between them. We use the STG to enable a novel approach for deriving a representative structural summary of a music corpus, which we formalize as a dually NP-hard combinatorial optimization problem. Our approach first applies simulated annealing to develop a measure of structural distance between two music pieces rooted in graph isomorphism. Our approach then combines the formal guarantees of SMT solvers with nested simulated annealing over structural distances to produce a structurally sound, representative centroid STG for an entire corpus of STGs from individual pieces. To evaluate our approach, we conduct experiments verifying that structural distance accurately differentiates between music pieces, and that derived centroids accurately structurally characterize their corpora.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "4000",
        "title": "Hallucination-Aware Prompt Optimization for Text-to-Video Synthesis",
        "authors": "Jiapeng Wang, Chengyu Wang, Jun Huang, Lianwen Jin",
        "abstract": "The rapid advancements in AI-generated content (AIGC) have led to extensive research and application of deep text-to-video (T2V) synthesis models, such as OpenAI's Sora. These models typically rely on high-quality prompt-video pairs and detailed text prompts for model training in order to produce high-quality videos. To boost the effectiveness of Sora-like T2V models, we introduce VidPrompter, an innovative large multi-modal model supporting T2V applications with three key functionalities: (1) generating detailed prompts from raw videos, (2) enhancing prompts from videos grounded with short descriptions, and (3) refining simple user-provided prompts to elevate T2V video quality. We train VidPrompter using a hybrid multi-task paradigm and propose the hallucination-aware direct preference optimization (HDPO) technique to improve the multi-modal, multi-task prompt optimization process. Experiments on various tasks show our method surpasses strong baselines and other competitors.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "2855",
        "title": "SynthRL: Cross-domain Synthesizer Sound Matching via Reinforcement Learning",
        "authors": "Wonchul Shin, Kyogu Lee",
        "abstract": "Generalization of synthesizer sound matching to external instrument sounds is highly challenging due to the non-differentiability of sound synthesis process which prohibits the use of out-of-domain sounds for training with synthesis parameter loss. We propose SynthRL, a novel reinforcement learning (RL)-based approach for cross-domain synthesizer sound matching. By incorporating sound similarity into the reward function, SynthRL effectively optimizes synthesis parameters without ground-truth labels, allowing fine-tuning on out-of-domain sounds. Furthermore, we introduce a transformer-based model architecture and reward-based prioritized experience replay to enhance RL training efficiency, considering the unique characteristics of the task. Experimental results demonstrate that SynthRL outperforms state-of-the-art methods on both in-domain and out-of-domain tasks. Further experimental analysis validates the effectiveness of our reward design, showing a strong correlation with human perception of sound similarity.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "8335",
        "title": "GETMusic: Generating Music Tracks with a Unified Representation and Diffusion Framework",
        "authors": "Ang Lv, Xu Tan, Peiling Lu, Wei Ye, Shikun Zhang, Jiang Bian, Rui Yan",
        "abstract": "Symbolic music generation aims to create musical notes, which can help users compose music, such as generating target instrument tracks based on provided source tracks. In practical scenarios where there’s a predefined ensemble of tracks and various composition needs, an efficient and effective generative model that can generate any target tracks based on the other tracks becomes crucial. However, previous efforts have fallen short in addressing this necessity due to limitations in their music representations and models. In this paper, we introduce a framework known as GETMusic, with ``GET'' standing for ``GEnerate music Tracks.'' This framework encompasses a novel music representation ``GETScore'' and a diffusion model ``GETDiff.'' GETScore represents musical notes as tokens and organizes tokens in a 2D structure, with tracks stacked vertically and progressing horizontally over time. At a training step, each track of a music piece is randomly selected as either the target or source. The training involves two processes: In the forward process, target tracks are corrupted by masking their tokens, while source tracks remain as the ground truth; in the denoising process, GETDiff is trained to predict the masked target tokens conditioning on the source tracks. Our proposed representation, coupled with the non-autoregressive generative model, empowers GETMusic to generate music with any arbitrary source-target track combinations.Our experiments demonstrate that the versatile GETMusic outperforms prior works proposed for certain specific composition tasks.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "9182",
        "title": "AI-Assisted Human-Pet Artistic Musical Co-Creation for Wellness Therapy",
        "authors": "Zihao Wang, Le Ma, Yuhang Jin, Yongsheng Feng, Xin Pan, Shulei Ji, Kejun Zhang",
        "abstract": "This paper explores AI-mediated human-pet musical co-creation from an interdisciplinary perspective, leveraging recent advancements in animal-assisted therapy. These advancements have shown significant psychosocial benefits, especially in reducing anxiety and enhancing social engagement. Building on these findings, this study innovatively employs pet vocal timbres as 'digital avatars' to enhance emotional investment during the music creation process. We propose PetCoCre, a novel system that applies pet vocal timbres in three distinct character paradigms within AI music creation: (1) PetRhythm: using pet voices as rhythmic percussion through beat synchronization. (2) PetMelody: enabling pet voices to act as melodic instruments via pitch-shifting alignment. (3) PetVocalia: utilizing pet vocal timbres as the target timbre for SVC (Singing Voice Conversion), where the converted singing voice replaces the original singer's voice, thus preserving the original semantic content.\r\nBeyond these character paradigms, our technical innovation lies in proposing SaMoye, the first open-source, high-quality zero-shot SVC model that effectively overcomes existing methods' zero-shot limitations by employing mixed speaker embeddings for timbre enhancement and leveraging a large-scale singing voice dataset.\r\nIn our experiments, we collected dog and cat vocalization data from pet stores and conducted experiments with 30 participants. Results demonstrate that the human-pet co-creation mode led to significant enhancements in pleasure and creative satisfaction compared to solo AI music generation, along with a significant reduction in participants' anxiety levels.\r\nThrough collaborative art creation, this research pioneers new paradigms for animal-assisted therapeutic interventions and expands the boundaries of AI-assisted creative collaboration.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8391",
        "title": "MagicTailor: Component-Controllable Personalization in Text-to-Image Diffusion Models",
        "authors": "Donghao Zhou, Jiancheng Huang, Jinbin Bai, Jiaze Wang, Hao Chen, Guangyong Chen, Xiaowei Hu, Pheng-Ann Heng",
        "abstract": "Text-to-image diffusion models can generate high-quality images but lack fine-grained control of visual concepts, limiting their creativity. Thus, we introduce component-controllable personalization, a new task that enables users to customize and reconfigure individual components within concepts. This task faces two challenges: semantic pollution, where undesired elements disrupt the target concept, and semantic imbalance, which causes disproportionate learning of the target concept and component. To address these, we design MagicTailor, a framework that uses Dynamic Masked Degradation to adaptively perturb unwanted visual semantics and Dual-Stream Balancing for more balanced learning of desired visual semantics. The experimental results show that MagicTailor achieves superior performance in this task and enables more personalized and creative image generation.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "1477",
        "title": "QA-MDT: Quality-aware Masked Diffusion Transformer for Enhanced Music Generation",
        "authors": "Chang Li, Ruoyu Wang, Lijuan Liu, Jun Du, Yixuan Sun, Zilu Guo, Zhengrong Zhang, Yuan Jiang, Jianqing Gao, Feng Ma",
        "abstract": "Text-to-music (TTM) generation, which converts textual descriptions into audio, opens up innovative avenues for multimedia creation.\r\nAchieving high quality and diversity in this process demands extensive, high-quality data, which are often scarce in available datasets. Most open-source datasets frequently suffer from issues like low-quality waveforms and low text-audio consistency, hindering the advancement of music generation models.\r\nTo address these challenges, we propose a novel quality-aware training paradigm for generating high-quality, high-musicality music from large-scale, quality-imbalanced datasets. Additionally, by leveraging unique properties in the latent space of musical signals, we adapt and implement a masked diffusion transformer (MDT) model for the TTM task, showcasing its capacity for quality control and enhanced musicality. Furthermore, we introduce a three-stage caption refinement approach to address low-quality captions' issue. Experiments show state-of-the-art (SOTA) performance on benchmark datasets including MusicCaps and the Song-Describer Dataset with both objective and subjective metrics.\r\nDemo audio samples are available at https:\/\/qa-mdt.github.io\/, code and pretrained checkpoints are open-sourced at https:\/\/github.com\/ivcylc\/OpenMusic.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8427",
        "title": "FancyVideo: Towards Dynamic and Consistent Video Generation via Cross-frame Textual Guidance",
        "authors": "Jiasong Feng, Ao Ma, Jing Wang, Ke Cao, Zhanjie Zhang",
        "abstract": "Synthesizing motion-rich and temporally consistent videos remains a challenge in artificial intelligence, especially when dealing with extended durations. Existing text-to-video (T2V) models commonly employ spatial cross-attention for text control, equivalently guiding different frame generations without frame-specific textual guidance. Thus, the model's capacity to comprehend the temporal logic conveyed in prompts and generate videos with coherent motion is restricted. To tackle this limitation, we introduce FancyVideo, an innovative video generator that improves the existing text-control mechanism with the well-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM incorporates the Temporal Information Injector (TII) and Temporal Affinity Refiner (TAR) at the beginning and end of cross-attention, respectively, to achieve frame-specific textual guidance. Firstly, TII injects frame-specific information from latent features into text conditions, thereby obtaining cross-frame textual conditions. Then, TAR refines the correlation matrix between cross-frame textual conditions and latent features along the time dimension.  Extensive experiments comprising both quantitative and qualitative evaluations demonstrate the effectiveness of FancyVideo. Our approach achieves state-of-the-art T2V generation results on the EvalCrafter benchmark and facilitates the synthesis of dynamic and consistent videos. Note that the T2V process of FancyVideo essentially involves a text-to-image step followed by T+I2V. This means it also supports the generation of videos from user images, i.e., the image-to-video (I2V) task. A significant number of experiments have shown that its performance is also outstanding.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8503",
        "title": "METEOR: Melody-aware Texture-controllable Symbolic Music Re-Orchestration via Transformer VAE",
        "authors": "Dinh-Viet-Toan Le, Yi-Hsuan Yang",
        "abstract": "Re-orchestration is the process of adapting a music piece for a different set of instruments. By altering the original instrumentation, the orchestrator often modifies the musical texture while preserving a recognizable melodic line and ensures that each part is playable within the technical and expressive capabilities of the chosen instruments.\r\n    \r\nIn this work, we propose METEOR, a model for generating Melody-aware Texture-controllable re-Orchestration with a Transformer-based variational auto-encoder (VAE). This model performs symbolic instrumental and textural music style transfers with a focus on melodic fidelity and controllability. We allow bar- and track-level controllability of the accompaniment with various textural attributes while keeping a homophonic texture. With both subjective and objective evaluations, we show that our model outperforms style transfer models on a re-orchestration task in terms of generation quality and controllability. Moreover, it can be adapted for a lead sheet orchestration task as a zero-shot learning model, achieving performance comparable to a model specifically trained for this task.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "9025",
        "title": "A Picture is Worth a Thousand Prompts? Efficacy of Iterative Human-Driven Prompt Refinement in Image Regeneration Tasks",
        "authors": "Khoi Trinh, Scott Seidenberger, Raveen Wijewickrama, Murtuza Jadliwala, Anindya Maiti",
        "abstract": "With AI-generated content becoming widespread across digital platforms, it is important to understand how such content is inspired and produced. This study explores the underexamined task of image regeneration, where a human operator iteratively refines prompts to recreate a specific target image. Unlike typical image generation, regeneration begins with a visual reference. A key challenge is whether existing image similarity metrics (ISMs) align with human judgments and can serve as useful feedback in this process. We conduct a structured user study to evaluate how iterative prompt refinement affects similarity to target images and whether ISMs reflect the improvements perceived by human observers. Our results show that prompt adjustments significantly improve alignment, both subjectively and quantitatively, highlighting the potential of iterative workflows in enhancing generative image quality.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "10:00",
        "session": "AI and Arts (1\/2)",
        "poster_positions": "From board n88 to board n92"
    },
    {
        "id": "8369",
        "title": "ExVideo: Extending Video Diffusion Models via Parameter-Efficient Post-Tuning",
        "authors": "Zhongjie Duan, Hong Zhang, Wenmeng Zhou, Cen Chen, Yaliang Li, Yu Zhang, Yingda Chen",
        "abstract": "Recently, advancements in video synthesis have attracted significant attention. Video synthesis models have demonstrated the practical applicability of diffusion models in creating dynamic visual content. Despite these advancements, the extension of video lengths remains constrained by computational resources. Most existing video synthesis models are limited to generating short video clips. In this paper, we propose a novel post-tuning methodology for video synthesis models, called ExVideo. This approach is designed to enhance the capability of current video synthesis models, allowing them to produce content over extended temporal durations while incurring lower training expenditures. In particular, we design extension strategies across common temporal model architectures respectively, including 3D convolution, temporal attention, and positional embedding. To evaluate the efficacy of our proposed post-tuning approach, we trained ExSVD, an extended model based on Stable Video Diffusion model. Our approach enhances the model's capacity to generate up to 5x its original number of frames, requiring only 1.5k GPU hours of training on a dataset comprising 40k videos. Importantly, the substantial increase in video length doesn't compromise the model's innate generalization capabilities, and the model showcases its advantages in generating videos of diverse styles and resolutions. We have released the source code and the enhanced model publicly.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8565",
        "title": "AdaptEdit: An Adaptive Correspondence Guidance Framework for Reference-Based Video Editing",
        "authors": "Tongtong Su, Chengyu Wang, Bingyan Liu, Jun Huang, Dongming Lu",
        "abstract": "Video editing is a pivotal process for customizing video content according to user needs. However, existing text-guided methods often lead to ambiguities regarding user intentions and restrict fine-grained control for editing specific aspects in videos. To overcome these limitations, this paper introduces a novel approach named \\emph{AdaptEdit}, which focuses on reference-based video editing that disentangles the editing process. It achieves this by first editing a reference image and then adaptively propagating its appearance across other frames to complete the video editing. While previous propagation methods, such as optical flow and the temporal modules of recent video generative models, struggle with object deformations and large motions, we propose an adaptive correspondence strategy that accurately transfers the appearance from the reference frame to the target frames by leveraging inter-frame semantic correspondences in the original video. By implementing a proxy-editing task to optimize hyperparameters for image token-level correspondence, our method effectively balances the need to maintain the target frame's structure while preventing leakage of irrelevant appearance. To more accurately evaluate editing beyond the semantic-level consistency provided by CLIP-style models, we introduce a new dataset, PVA, which supports pixel-level evaluation. Our method outperforms the best-performing baseline with a clear PSNR improvement of 3.6 dB.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8615",
        "title": "NotaGen: Advancing Musicality in Symbolic Music Generation with Large Language Model Training Paradigms",
        "authors": "Yashan Wang, Shangda Wu, Jianhuai Hu, Xingjian Du, Yueqi Peng, Yongxin Huang, Shuai Fan, Xiaobing Li, Feng Yu, Maosong Sun",
        "abstract": "We introduce NotaGen, a symbolic music generation model aiming to explore the potential of producing high-quality classical sheet music. Inspired by the success of Large Language Models (LLMs), NotaGen adopts pre-training, fine-tuning, and reinforcement learning paradigms (henceforth referred to as the LLM training paradigms). It is pre-trained on 1.6M pieces of music in ABC notation, and then fine-tuned on approximately 9K high-quality classical compositions conditioned on \"period-composer-instrumentation\"  prompts. For reinforcement learning, we propose the CLaMP-DPO method, which further enhances generation quality and controllability without requiring human annotations or predefined rewards. Our experiments demonstrate the efficacy of CLaMP-DPO in symbolic music generation models with different architectures and encoding schemes. Furthermore, subjective A\/B tests show that NotaGen outperforms baseline models against human compositions, greatly advancing musical aesthetics in symbolic music generation.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "10:00",
        "session": "AI and Arts (1\/2)",
        "poster_positions": "From board n88 to board n92"
    },
    {
        "id": "8370",
        "title": "FastBlend: Enhancing Video Stylization Consistency via Model-Free Patch Blending",
        "authors": "Zhongjie Duan, Chengyu Wang, Cen Chen, Weining Qian, Jun Huang, Mingyi Jin",
        "abstract": "With the emergence of diffusion models and the rapid development of image processing, generating artistic images in style transfer tasks has become effortless. However, these impressive image processing approaches face consistency issues in video processing due to the independent processing of each frame. In this paper, we propose a powerful, model-free approach called FastBlend to address the consistency problem in video stylization. FastBlend functions as a post-processor and can be seamlessly integrated with diffusion models to create a robust video stylization pipeline. Based on a patch-matching algorithm, we remap and blend the aligned content across multiple frames, thus compensating for inconsistent content with neighboring frames. Moreover, we propose a tree-like data structure and a specialized loss function, aiming to optimize computational efficiency and visual quality for different application scenarios. Extensive experiments have demonstrated the effectiveness of FastBlend. Compared with both independent video deflickering algorithms and diffusion-based video processing methods, FastBlend is capable of synthesizing more coherent and realistic videos.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "9218",
        "title": "Pay Attention to the Keys: Visual Piano Transcription Using Transformers",
        "authors": "Uros Zivanovic, Ivan Pilkov, Carlos Cancino-Chacón",
        "abstract": "Visual piano transcription (VPT) is the task of obtaining a symbolic representation of a piano performance from visual information alone (e.g., from a top-down video of the piano keyboard). In this work we propose a VPT system based on the vision transformer (ViT), which surpasses previous methods based on convolutional neural networks (CNNs). Our system is trained on the newly introduced R3 dataset, consisting of ca.~31 hours of synchronized video and MIDI recordings of piano performances. We additionally introduce an approach to predict note offsets, which has not been previously explored in this context. We show that our system outperforms the state-of-the-art on the PianoYT dataset for onset prediction and on the R3 dataset for both onsets and offsets.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "8700",
        "title": "Intoner: For Chinese Poetry Intoning Synthesis",
        "authors": "Heda Zuo, Liyao Sun, Zeyu Lai, Weitao You, Pei Chen, Lingyun Sun",
        "abstract": "Chinese Poetry Intoning, with improvised melodies devoid of fixed musical scores, is crucial for emotional expression and prosodic rendition. However, this cultural heritage faces challenges in propagation due to scant audio records and a scarcity of domain experts. Existing text-to-speech models lack the ability to generate melodious audio, while singing-voice-synthesis models rely on predetermined musical scores, which are all unsuitable for intoning synthesis. Hence, we introduce Chinese Poetry Intoning Synthesis (PIS) as a novel task to reproduce intoning audio and preserve this age-old cultural art. Corresponding to this task, we summarize three-level principles from poetry metrical patterns and construct a diffusion PIS model Intoner based on them. We also collect a multi-style Chinese poetry intoning dataset of text-audio pairs accompanied by feature annotations. Experimental results show that our model effectively learns diverse intoning styles and contents which can synthesize more melodious and vibrant intoning audio. To the best of our knowledge, we are the first to work on poetry intoning synthesis task.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8437",
        "title": "Precarity and Solidarity: Preliminary Results on a Study of Queer and Disabled Fiction Writers’ Experiences with Generative AI",
        "authors": "Carolyn Lamb, Daniel G. Brown, Maura R. Grossman",
        "abstract": "We present a mixed-methods study of professional fiction writers' experiences with generative AI (genAI), primarily focused on queer and disabled writers. Queer and disabled writers are markedly more pessimistic than others about the impact of genAI on their industry, although pessimism is the majority attitude for all groups. We explore how genAI exacerbates existing causes of precarity for writers, reasons why writers are opposed to its use, and strategies used by marginalized fiction writers to safeguard their industry.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "10:00",
        "session": "AI and Arts (1\/2)",
        "poster_positions": "From board n88 to board n92"
    },
    {
        "id": "8504",
        "title": "Scan-and-Print: Patch-level Data Summarization and Augmentation for Content-aware Layout Generation in Poster Design",
        "authors": "HsiaoYuan Hsu, Yuxin Peng",
        "abstract": "In AI-empowered poster design, content-aware layout generation is crucial for the on-image arrangement of visual-textual elements, e.g., logo, text, and underlay. To perceive the background images, existing work demanded a high parameter count that far exceeds the size of available training data, which has impeded the model's real-time performance and generalization ability. To address these challenges, we proposed a patch-level data summarization and augmentation approach, vividly named Scan-and-Print. Specifically, the scan procedure selects only the patches suitable for placing element vertices to perform fine-grained perception efficiently. Then, the print procedure mixes up the patches and vertices across two image-layout pairs to synthesize over 100% new samples in each epoch while preserving their plausibility. Besides, to facilitate the vertex-level operations, a vertex-based layout representation is introduced. Extensive experimental results on widely used benchmarks demonstrated that Scan-and-Print can generate visually appealing layouts with state-of-the-art quality while dramatically reducing computational bottleneck by 95.2%. The project page is at https:\/\/thekinsley.github.io\/Scan-and-Print\/.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8860",
        "title": "Towards a Practical Tool for Music Composition: Using Constraint Programming to Model Chord Progressions and Modulations",
        "authors": "Damien Sprockeels, Peter Van Roy",
        "abstract": "The Harmoniser project aims to provide a practical tool to aid music composers in creating complete musical works. In this paper, we present a formal model of its second layer, tonal chord progressions and modulations to neighbouring tonalities, and a practical implementation using the Gecode constraint solver. Since music composition is too complex to formalize in its entirety, the Harmoniser project makes two assumptions for tractability: first, it focuses on tonal music (the basis of Western classical and popular music); second, it defines a simplified four-layer composition process that is relevant for a significant number of composers. Previous work on using constraint programming for music composition was limited to exploring the formalisation of different musical aspects and did not address the overall problem of building a practical composer tool. Harmoniser's four layers are global structure (tonal development of the whole piece), chord progressions (diatonic and chromatic) and modulations, voicing (four-voice chord layout), and ornaments (e.g., passing notes, appoggiaturas), all allowing iterative refinement by the composer. This paper builds on prior work for voicing layer 3, Diatony, and presents a model for layer 2, chord progressions and modulations. The results of the present paper can be used as input to Diatony to generate voicing. Future work will define models for the remaining layers, and combine all layers together with a graphical user interface as a plug-in for a DAW.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "8393",
        "title": "Large Language Model Meets Constraint Propagation",
        "authors": "Alexandre Bonlarron, Florian Régin, Elisabetta De Maria, Jean-Charles Régin",
        "abstract": "Large Language Models (LLMs) excel at generating fluent text but struggle to enforce external constraints because they generate tokens sequentially without explicit control mechanisms. GenCP addresses this limitation by combining LLM predictions with Constraint Programming (CP) reasoning, formulating text generation as a Constraint Satisfaction Problem (CSP). In this paper, we improve GenCP by integrating Masked Language Models (MLMs) for domain generation, which allows bidirectional constraint propagation that leverages both past and future tokens. This integration bridges the gap between token-level prediction and structured constraint enforcement, leading to more reliable and constraint-aware text generation. Our evaluation on COLLIE benchmarks demonstrates that incorporating domain preview via MLM calls significantly improves GenCP's performance. Although this approach incurs additional MLM calls and, in some cases, increased backtracking, the overall effect is a more efficient use of LLM inferences and an enhanced ability to generate feasible and meaningful solutions, particularly in tasks with strict content constraints.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "10:00",
        "session": "AI and Arts (1\/2)",
        "poster_positions": "From board n88 to board n92"
    },
    {
        "id": "8971",
        "title": "Algorithmic Composition Using Narrative Structure and Tension",
        "authors": "Francisco Braga, Gilberto Bernardes, Roger B. Dannenberg, Nuno Correia",
        "abstract": "This paper describes an approach to algorithmic music composition that takes narrative structures as input, allowing composers to create music directly from narrative elements.\r\nCreating narrative development in music remains a challenging task in algorithmic composition.\r\nOur system addresses this by combining leitmotifs to represent characters, generative grammars for harmonic coherence, and evolutionary algorithms to align musical tension with narrative progression.\r\nThe system operates at different scales, from overall plot structure to individual motifs, enabling both autonomous composition and co-creation with varying degrees of user control.\r\nEvaluation with compositions based on tales demonstrated the system's ability to compose music that supports narrative listening and aligns with its source narratives, while being perceived as familiar and enjoyable.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    },
    {
        "id": "8738",
        "title": "A³-Net: Calibration-Free Multi-View 3D Hand Reconstruction for Enhanced Musical Instrument Learning",
        "authors": "Geng Chen, Xufeng Jian, Yuchen Chen, Pengfei Ren, Jingyu Wang, Haifeng Sun, Qi Qi, Jing Wang, Jianxin Liao",
        "abstract": "Precise 3D hand posture is essential for learning musical instruments. Reconstructing highly precise 3D hand gestures enables learners to correct and master proper techniques through 3D simulation and Extended Reality. However, exsiting methods typically rely on precisely calibrated multi-camera systems, which are not easily deployable in everyday environments. In this paper, we focus on calibration-free multi-view 3D hand reconstruction in unconstrained scenarios. Establishing correspondences between multi-view images is particularly challenging without camera extrinsics. To address this, we propose A^3-Net, a multi-level alignment framework that utilizes 3D structural representations with hierarchical geometric and explicit semantic information as alignment proxies, facilitating multi-view feature interaction in both 3D geometric space and 2D visual space. Specifically, we first perfrom global geometric alignment to map multi-view features into a canonical space. Subsequently, we aggregate information into predefined sparse and dense proxies to further integrate cross-view semantics through mutual interaction. Finnaly, we perfrom 2D alignment to align projected 2D visual features with 2D observations. Our method achieves state-of-the-art results in the multi-view 3D hand reconstruction task, demonstrating the effectiveness of our proposed framework.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8876",
        "title": "Weakly-Supervised Movie Trailer Generation Driven by Multi-Modal Semantic Consistency",
        "authors": "Sidan Zhu, Yutong Wang, Hongteng Xu, Dixin Luo",
        "abstract": "As an essential movie promotional tool, trailers are designed to capture the audience's interest through the skillful editing of key movie shots. Although some attempts have been made for automatic trailer generation, existing methods often rely on predefined rules or manual fine-grained annotations and fail to fully leverage the multi-modal information of movies, resulting in unsatisfactory trailer generation results. In this study, we introduce a weakly-supervised trailer generation method driven by multi-modal semantic consistency. Specifically, we design a multi-modal trailer generation framework that selects and sorts key movie shots based on input music and movie metadata (e.g., category tags and plot keywords) and adds narration to the generated trailer based on movie subtitles. We utilize two pseudo-scores derived from the proposed framework as labels and thus train the model under a weakly-supervised learning paradigm, ensuring trailerness consistency for key shot selection and emotion consistency for key shot sorting, respectively. As a result, we can learn the proposed model solely based on movie-trailer pairs without any fine-grained annotations. Both objective experimental results and subjective user studies demonstrate the superior performance of our method over previous works. The code is available at https:\/\/github.com\/Dixin-Lab\/MMSC.",
        "location": "Guangzhou",
        "day": "August 31st",
        "hour": "09:40",
        "session": "AI, arts and Creativity"
    },
    {
        "id": "8642",
        "title": "Leveraging Large Language Models for Active Merchant Non-player Characters",
        "authors": "Byungjun Kim, Minju Kim, Dayeon Seo, Bugeun Kim",
        "abstract": "We highlight two significant issues leading to the passivity of current merchant non-player characters (NPCs): pricing and communication. While immersive interactions with active NPCs have been a focus, price negotiations between merchant NPCs and players remain underexplored. First, passive pricing refers to the limited ability of merchants to modify predefined item prices. Second, passive communication means that merchants can only interact with players in a scripted manner. To tackle these issues and create an active merchant NPC, we propose a merchant framework based on large language models (LLMs), called MART, which consists of an appraiser module and a negotiator module. We conducted two experiments to explore various implementation options under different training methods and LLM sizes, considering a range of possible game environments. Our findings indicate that finetuning methods, such as supervised finetuning (SFT) and knowledge distillation (KD), are effective in using smaller LLMs to implement active merchant NPCs. Additionally, we found three irregular cases arising from the responses of LLMs.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "10:00",
        "session": "AI and Arts (1\/2)",
        "poster_positions": "From board n88 to board n92"
    },
    {
        "id": "8676",
        "title": "SmartSpatial: Enhancing 3D Spatial Awareness in Stable Diffusion with a Novel Evaluation Framework",
        "authors": "Mao Xun Huang, Brian J Chan, Hen-Hsen Huang",
        "abstract": "Stable Diffusion models have made remarkable strides in generating photorealistic images from text prompts but often falter when tasked with accurately representing complex spatial arrangements, particularly involving intricate 3D relationships. \r\nTo address this limitation, we introduce SmartSpatial, an innovative approach that not only enhances the spatial arrangement capabilities of Stable Diffusion but also fosters AI-assisted creative workflows through 3D-aware conditioning and attention-guided mechanisms. \r\nSmartSpatial incorporates depth information injection and cross-attention control to ensure precise object placement, delivering notable improvements in spatial accuracy metrics. \r\nIn conjunction with SmartSpatial, we present SmartSpatialEval, a comprehensive evaluation framework that bridges computational spatial accuracy with qualitative artistic assessments. \r\nExperimental results show that SmartSpatial significantly outperforms existing methods, setting new benchmarks for spatial fidelity in AI-driven art and creativity.",
        "location": "Montreal",
        "day": "August 20th",
        "hour": "14:00",
        "session": "AI and Arts (2\/2)",
        "poster_positions": "From board n81 to board n87"
    }
]