李学龙* and 赵斌, “视频萃取,” 中国科学: 信息科学, 2021, 51(5): 695-734. [BibTeX] | [PDF]
B. Zhao, P. Han, and X. Li, "Vehicle perception from satellite," IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), vol. 46, no. 4, pp. 2545-2554, 2023, IEEE. [BibTeX] | [PDF]
@article{zhao2023vehicle,
  author    = {Zhao, Bin and Han, Pengfei and Li, Xuelong},
  title     = {Vehicle perception from satellite},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume    = {46},
  number    = {4},
  pages     = {2545--2554},
  year      = {2023},
  publisher = {IEEE}
}
B. Zhao, H. Li, X. Lu, and X. Li*, "Reconstructive Sequence-Graph Network for Video Summarization," IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI), vol. 44, no. 5, pp. 2793-2801, 2022. [BibTeX] | [PDF]
@article{zhao2021reconstructive,
  author    = {Zhao, Bin and Li, Haopeng and Lu, Xiaoqiang and Li, Xuelong},
  title     = {Reconstructive Sequence-Graph Network for Video Summarization},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume    = {44},
  number    = {5},
  pages     = {2793--2801},
  year      = {2022},
  publisher = {IEEE}
}
B. Zhao, X. Li, and X. Lu, "HSA-RNN: Hierarchical Structure-Adaptive RNN for Video Summarization," Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7405-7414, 2018. [BibTeX] | [PDF]
@inproceedings{zhao2018hsa,
  author    = {Zhao, Bin and Li, Xuelong and Lu, Xiaoqiang},
  title     = {{HSA-RNN}: Hierarchical Structure-Adaptive {RNN} for Video Summarization},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {7405--7414},
  year      = {2018}
}
K. Xu, C. Bai, X. Ma, D. Wang, B. Zhao, Z. Wang, X. Li, and W. Li, "Cross-domain policy adaptation via value-guided data filtering," Advances in Neural Information Processing Systems (NeurIPS), vol. 36, pp. 73395-73421, 2023. [BibTeX] | [PDF]
@article{xu2023cross,
  author  = {Xu, Kang and Bai, Chenjia and Ma, Xiaoteng and Wang, Dong and Zhao, Bin and Wang, Zhen and Li, Xuelong and Li, Wei},
  title   = {Cross-domain policy adaptation via value-guided data filtering},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2023},
  volume  = {36},
  pages   = {73395--73421}
}
C. Yan, D. Qu, D. Xu, B. Zhao, Z. Wang, D. Wang, and X. Li, "Gs-slam: Dense visual slam with 3d gaussian splatting," Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19595-19604, 2024. [BibTeX] | [PDF]
@inproceedings{yan2024gs,
  author    = {Yan, Chi and Qu, Delin and Xu, Dan and Zhao, Bin and Wang, Zhigang and Wang, Dong and Li, Xuelong},
  title     = {{GS-SLAM}: Dense Visual {SLAM} with {3D} {Gaussian} Splatting},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {19595--19604},
  year      = {2024}
}
D. Qu, C. Yan, D. Wang, J. Yin, Q. Chen, D. Xu, Y. Zhang, B. Zhao, and X. Li, "Implicit event-rgbd neural slam," Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19584-19594, 2024. [BibTeX] | [PDF]
@inproceedings{qu2024implicit,
  author    = {Qu, Delin and Yan, Chi and Wang, Dong and Yin, Jie and Chen, Qizhi and Xu, Dan and Zhang, Yiting and Zhao, Bin and Li, Xuelong},
  title     = {Implicit Event-{RGBD} Neural {SLAM}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {19584--19594},
  year      = {2024}
}
Y. Tang, R. Zhang, Z. Guo, X. Ma, B. Zhao, Z. Wang, D. Wang, and X. Li, "Point-peft: Parameter-efficient fine-tuning for 3d pre-trained models," Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 38, no. 6, pp. 5171-5179, 2024. [BibTeX] | [PDF]
@inproceedings{tang2024point,
  author    = {Tang, Yiwen and Zhang, Ray and Guo, Zoey and Ma, Xianzheng and Zhao, Bin and Wang, Zhigang and Wang, Dong and Li, Xuelong},
  title     = {{Point-PEFT}: Parameter-Efficient Fine-Tuning for {3D} Pre-Trained Models},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {38},
  number    = {6},
  pages     = {5171--5179},
  year      = {2024}
}
W. Xia, D. Wang, X. Pang, Z. Wang, B. Zhao, D. Hu, and X. Li, "Kinematic-aware prompting for generalizable articulated object manipulation with llms," 2024 IEEE International Conference on Robotics and Automation (ICRA), pp. 2073-2080, 2024, IEEE. [BibTeX] | [PDF]
@inproceedings{xia2024kinematic,
  author       = {Xia, Wenke and Wang, Dong and Pang, Xincheng and Wang, Zhigang and Zhao, Bin and Hu, Di and Li, Xuelong},
  title        = {Kinematic-Aware Prompting for Generalizable Articulated Object Manipulation with {LLMs}},
  booktitle    = {2024 IEEE International Conference on Robotics and Automation (ICRA)},
  pages        = {2073--2080},
  year         = {2024},
  organization = {IEEE}
}
M. Cui, Z. Wang, D. Wang, B. Zhao, and X. Li, "Color event enhanced single-exposure HDR imaging," Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 38, no. 2, pp. 1399-1407, 2024. [BibTeX] | [PDF]
@inproceedings{cui2024color,
  author    = {Cui, Mengyao and Wang, Zhigang and Wang, Dong and Zhao, Bin and Li, Xuelong},
  title     = {Color Event Enhanced Single-Exposure {HDR} Imaging},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {38},
  number    = {2},
  pages     = {1399--1407},
  year      = {2024}
}
Y. Tang, R. Zhang, J. Liu, Z. Guo, B. Zhao, Z. Wang, P. Gao, H. Li, D. Wang, and X. Li, "Any2point: Empowering any-modality large models for efficient 3d understanding," European Conference on Computer Vision (ECCV), pp. 456-473, 2024. [BibTeX] | [PDF]
@inproceedings{tang2024any2point,
  author       = {Tang, Yiwen and Zhang, Ray and Liu, Jiaming and Guo, Zoey and Zhao, Bin and Wang, Zhigang and Gao, Peng and Li, Hongsheng and Wang, Dong and Li, Xuelong},
  title        = {{Any2Point}: Empowering Any-Modality Large Models for Efficient {3D} Understanding},
  booktitle    = {European Conference on Computer Vision},
  pages        = {456--473},
  year         = {2024},
  organization = {Springer}
}
Z. Li, B. Zhao, and Y. Yuan, "Cyclic learning for binaural audio generation and localization," Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 26669-26678, 2024. [BibTeX] | [PDF]
@inproceedings{li2024cyclic,
  author    = {Li, Zhaojian and Zhao, Bin and Yuan, Yuan},
  title     = {Cyclic learning for binaural audio generation and localization},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2024},
  pages     = {26669--26678}
}
X. Gao, P. Zhang, D. Qu, D. Wang, Z. Wang, Y. Ding, and B. Zhao, "Learning 2d invariant affordance knowledge for 3d affordance grounding," Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 39, no. 3, pp. 3095-3103, 2025. [BibTeX] | [PDF]
@inproceedings{gao2025learning,
  author    = {Gao, Xianqiang and Zhang, Pingrui and Qu, Delin and Wang, Dong and Wang, Zhigang and Ding, Yan and Zhao, Bin},
  title     = {Learning {2D} Invariant Affordance Knowledge for {3D} Affordance Grounding},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {39},
  number    = {3},
  pages     = {3095--3103},
  year      = {2025}
}
K. Liu, Z. Tang, D. Wang, Z. Wang, X. Li, and B. Zhao, "Coherent: Collaboration of heterogeneous multi-robot system with large language models," 2025 IEEE International Conference on Robotics and Automation (ICRA), pp. 10208-10214, 2025, IEEE. [BibTeX] | [PDF]
@inproceedings{liu2025coherent,
  author       = {Liu, Kehui and Tang, Zixin and Wang, Dong and Wang, Zhigang and Li, Xuelong and Zhao, Bin},
  title        = {Coherent: Collaboration of heterogeneous multi-robot system with large language models},
  booktitle    = {2025 IEEE International Conference on Robotics and Automation (ICRA)},
  organization = {IEEE},
  year         = {2025},
  pages        = {10208--10214}
}
L. Jing, Y. Xue, X. Yan, C. Zheng, D. Wang, R. Zhang, Z. Wang, H. Fang, B. Zhao, and Z. Li, "X4d-sceneformer: Enhanced scene understanding on 4d point cloud videos through cross-modal knowledge transfer," Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 38, no. 3, pp. 2670-2678, 2024. [BibTeX] | [PDF]
@inproceedings{jing2024x4d,
  author    = {Jing, Linglin and Xue, Ying and Yan, Xu and Zheng, Chaoda and Wang, Dong and Zhang, Ruimao and Wang, Zhigang and Fang, Hui and Zhao, Bin and Li, Zhen},
  title     = {{X4D-SceneFormer}: Enhanced Scene Understanding on {4D} Point Cloud Videos through Cross-Modal Knowledge Transfer},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {38},
  number    = {3},
  pages     = {2670--2678},
  year      = {2024}
}
L. Jing, Y. Ding, Y. Gao, Z. Wang, X. Yan, D. Wang, G. Schaefer, H. Fang, B. Zhao, and X. Li, "HPL-ESS: hybrid pseudo-labeling for unsupervised event-based semantic segmentation," Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23128-23137, 2024. [BibTeX] | [PDF]
@inproceedings{jing2024hpl,
  author    = {Jing, Linglin and Ding, Yiming and Gao, Yunpeng and Wang, Zhigang and Yan, Xu and Wang, Dong and Schaefer, Gerald and Fang, Hui and Zhao, Bin and Li, Xuelong},
  title     = {{HPL-ESS}: Hybrid Pseudo-Labeling for Unsupervised Event-Based Semantic Segmentation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {23128--23137},
  year      = {2024}
}
H. He, C. Bai, L. Pan, W. Zhang, B. Zhao, and X. Li, "Learning an actionable discrete diffusion policy via large-scale actionless video pre-training," Advances in Neural Information Processing Systems (NeurIPS), vol. 37, pp. 31124-31153, 2024. [BibTeX] | [PDF]
@article{he2024learning,
  author  = {He, Haoran and Bai, Chenjia and Pan, Ling and Zhang, Weinan and Zhao, Bin and Li, Xuelong},
  title   = {Learning an actionable discrete diffusion policy via large-scale actionless video pre-training},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2024},
  volume  = {37},
  pages   = {31124--31153}
}
D. Qu, Q. Chen, P. Zhang, X. Gao, B. Zhao, Z. Wang, D. Wang, and X. Li, "LiveScene: Language Embedding Interactive Radiance Fields for Physical Scene Control and Rendering," Advances in Neural Information Processing Systems (NeurIPS), vol. 37, pp. 12271-12292, 2024. [BibTeX] | [PDF]
@article{qu2024livescene,
  author  = {Qu, Delin and Chen, Qizhi and Zhang, Pingrui and Gao, Xianqiang and Zhao, Bin and Wang, Zhigang and Wang, Dong and Li, Xuelong},
  title   = {{LiveScene}: Language Embedding Interactive Radiance Fields for Physical Scene Control and Rendering},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {12271--12292},
  year    = {2024}
}
G. Li, B. Zhao, and X. Li, "Low-light image enhancement with sam-based structure priors and guidance," IEEE Transactions on Multimedia (TMM), vol. 26, pp. 10854-10866, 2024, IEEE. [BibTeX] | [PDF]
@article{li2024low,
  author    = {Li, Guanlin and Zhao, Bin and Li, Xuelong},
  title     = {Low-Light Image Enhancement with {SAM}-Based Structure Priors and Guidance},
  journal   = {IEEE Transactions on Multimedia},
  volume    = {26},
  pages     = {10854--10866},
  year      = {2024},
  publisher = {IEEE}
}
G. Lan, Q. Ma, Y. Yang, Z. Wang, D. Wang, X. Li, and B. Zhao, "Efficient Diffusion as Low Light Enhancer," Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR), pp. 21277-21286, 2025. [BibTeX] | [PDF]
@inproceedings{lan2025efficient,
  author    = {Lan, Guanzhou and Ma, Qianli and Yang, Yuqi and Wang, Zhigang and Wang, Dong and Li, Xuelong and Zhao, Bin},
  title     = {Efficient Diffusion as Low Light Enhancer},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  year      = {2025},
  pages     = {21277--21286}
}
Y. Yuan, Z. Li, and B. Zhao, "A survey of multimodal learning: Methods, applications, and future," ACM Computing Surveys (ACM Comput. Surv.), vol. 57, no. 7, pp. 1-34, 2025, ACM New York, NY. [BibTeX] | [PDF]
@article{yuan2025survey,
  author    = {Yuan, Yuan and Li, Zhaojian and Zhao, Bin},
  title     = {A survey of multimodal learning: Methods, applications, and future},
  journal   = {ACM Computing Surveys},
  volume    = {57},
  number    = {7},
  pages     = {1--34},
  year      = {2025},
  publisher = {ACM},
  address   = {New York, NY}
}
K. Liu, C. Guan, Z. Jia, ..., B. Zhao, and X. Li, "FastUMI: A Scalable and Hardware-Independent Universal Manipulation Interface with Dataset," arXiv preprint arXiv:2409.19499, 2024. [BibTeX] | [PDF]
@article{liu2024fastumi,
  author  = {Liu, Kehui and Guan, Chuyue and Jia, Zhongjie and Wu, Ziniu and Liu, Xin and Wang, Tianyu and Liang, Shuai and Chen, Pengan and Zhang, Pingrui and Song, Haoming and others},
  title   = {{FastUMI}: A Scalable and Hardware-Independent Universal Manipulation Interface with Dataset},
  journal = {arXiv preprint arXiv:2409.19499},
  year    = {2024}
}
Y. Yao, S. Liu, H. Song, D. Qu, Q. Chen, Y. Ding, B. Zhao, Z. Wang, X. Li, and D. Wang, "Think Small, Act Big: Primitive Prompt Learning for Lifelong Robot Manipulation," Proceedings of the Computer Vision and Pattern Recognition Conference (CVPR), pp. 22573-22583, 2025. [BibTeX] | [PDF]
@inproceedings{yao2025think,
  author    = {Yao, Yuanqi and Liu, Siao and Song, Haoming and Qu, Delin and Chen, Qizhi and Ding, Yan and Zhao, Bin and Wang, Zhigang and Li, Xuelong and Wang, Dong},
  title     = {Think Small, Act Big: Primitive Prompt Learning for Lifelong Robot Manipulation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  year      = {2025},
  pages     = {22573--22583}
}
H. Song, D. Qu, Y. Yao, Q. Chen, Q. Lv, Y. Tang, M. Shi, G. Ren, M. Yao, B. Zhao, D. Wang, and X. Li, "Hume: Introducing System-2 Thinking in Visual-Language-Action Model," arXiv preprint arXiv:2505.21432, 2025. [BibTeX] | [PDF]
@article{song2025hume,
  author  = {Song, Haoming and Qu, Delin and Yao, Yuanqi and Chen, Qizhi and Lv, Qi and Tang, Yiwen and Shi, Modi and Ren, Guanghui and Yao, Maoqing and Zhao, Bin and Wang, Dong and Li, Xuelong},
  title   = {Hume: Introducing System-2 Thinking in Visual-Language-Action Model},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.21432}
}
P. Zhang, Y. Su, P. Wu, D. An, L. Zhang, Z. Wang, D. Wang, Y. Ding, B. Zhao, and X. Li, "Cross from Left to Right Brain: Adaptive Text Dreamer for Vision-and-Language Navigation," arXiv preprint arXiv:2505.20897, 2025. [BibTeX] | [PDF]
@article{zhang2025cross,
  author  = {Zhang, Pingrui and Su, Yifei and Wu, Pengyuan and An, Dong and Zhang, Li and Wang, Zhigang and Wang, Dong and Ding, Yan and Zhao, Bin and Li, Xuelong},
  title   = {Cross from Left to Right Brain: Adaptive Text Dreamer for Vision-and-Language Navigation},
  year    = {2025},
  journal = {arXiv preprint arXiv:2505.20897}
}
Z. Wang, Y. Su, C. Li, D. Wang, Y. Huang, X. Li, and B. Zhao, "Open-Vocabulary Octree-Graph for 3D Scene Understanding," Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV), pp. 7037-7047, 2025. [BibTeX] | [PDF]
@inproceedings{wang2025open,
  author    = {Wang, Zhigang and Su, Yifei and Li, Chenhui and Wang, Dong and Huang, Yan and Li, Xuelong and Zhao, Bin},
  title     = {Open-Vocabulary Octree-Graph for {3D} Scene Understanding},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {7037--7047},
  year      = {2025}
}
Y. Gao, C. Li, Z. You, ..., B. Zhao, and X. Li, "OpenFly: A Comprehensive Platform for Aerial Vision-Language Navigation," arXiv preprint arXiv:2502.18041, 2025. [BibTeX] | [PDF]
@article{gao2025openfly,
  author  = {Gao, Yunpeng and Li, Chenhui and You, Zhongrui and Liu, Junli and Li, Zhen and Chen, Pengan and Chen, Qizhi and Tang, Zhonghan and Wang, Liansheng and Yang, Penghui and others},
  title   = {{OpenFly}: A Comprehensive Platform for Aerial Vision-Language Navigation},
  journal = {arXiv preprint arXiv:2502.18041},
  year    = {2025}
}
J. Liu, Q. Chen, Z. Wang, Y. Tang, Y. Zhang, C. Yan, D. Wang, X. Li, and B. Zhao, "AerialVG: A Challenging Benchmark for Aerial Visual Grounding by Exploring Positional Relations," arXiv preprint arXiv:2504.07836, 2025. [BibTeX] | [PDF]
@article{liu2025aerialvg,
  author  = {Liu, Junli and Chen, Qizhi and Wang, Zhigang and Tang, Yiwen and Zhang, Yiting and Yan, Chi and Wang, Dong and Li, Xuelong and Zhao, Bin},
  title   = {{AerialVG}: A Challenging Benchmark for Aerial Visual Grounding by Exploring Positional Relations},
  journal = {arXiv preprint arXiv:2504.07836},
  year    = {2025}
}
D. Qu, H. Song, Q. Chen, ..., B. Zhao, and D. Wang, "Embodiedonevision: Interleaved vision-text-action pretraining for general robot control," arXiv e-prints, arXiv:2508.21112, 2025. [BibTeX] | [PDF]
@article{qu2025embodiedonevision,
  author  = {Qu, Delin and Song, Haoming and Chen, Qizhi and Chen, Zhaoqing and Gao, Xianqiang and Ye, Xinyi and Lv, Qi and Shi, Modi and Ren, Guanghui and Ruan, Cheng and others},
  title   = {{EmbodiedOneVision}: Interleaved Vision-Text-Action Pretraining for General Robot Control},
  journal = {arXiv preprint arXiv:2508.21112},
  year    = {2025}
}
D. Qu, H. Song, Q. Chen, ..., B. Zhao, D. Wang, and X. Li, "SpatialVLA: Exploring Spatial Representations for Visual-Language-Action Model," arXiv preprint arXiv:2501.15830, 2025. [BibTeX] | [PDF]
@article{qu2025spatialvla,
  author  = {Qu, Delin and Song, Haoming and Chen, Qizhi and Yao, Yuanqi and Ye, Xinyi and Ding, Yan and Wang, Zhigang and Gu, JiaYuan and Zhao, Bin and Wang, Dong and others},
  title   = {{SpatialVLA}: Exploring Spatial Representations for Visual-Language-Action Model},
  journal = {arXiv preprint arXiv:2501.15830},
  year    = {2025}
}