diff --git "a/app.py" "b/app.py" deleted file mode 100644--- "a/app.py" +++ /dev/null @@ -1,5629 +0,0 @@ -import streamlit as st - -# Initialize the slide groups in session state on first run. -if "slide_groups" not in st.session_state: - st.session_state.slide_groups = [ - { - "group": "Slide 1: Introduction", - "content": r""" -**Title:** AI Toolbox: 20 Papers in 5 Minutes -**Goal:** Show how these topics (Torch, Ollama, Deepseek, SFT, knowledge distillation, crowdsourcing, etc.) tie together into an end-to-end AI pipeline. -**Media:** Quick intro audio & a short video clip highlighting AI breakthroughs. - """ - }, - { - "group": "Slides 2–3: Torch (PyTorch Foundations)", - "content": r""" -**Paper 1** -*Reference:* Paszke, A. et al. “PyTorch: An Imperative Style, High-Performance Deep Learning Library.” arXiv:1912.01703 (2019) -*Key Points:* -- Dynamic computation graphs for rapid prototyping. -- Strong GPU acceleration and broad community support. -*Presentation Element:* Brief code snippet in Python + a Mermaid flowchart showing how forward/backprop flows in PyTorch. - -**Paper 2** -*Reference:* Paszke, A. et al. “Automatic Differentiation in PyTorch.” arXiv:1707.?? (Hypothetical reference) -*Key Points:* -- Core mechanism behind autograd. -- How tensor operations are tracked and reversed for gradients. -*Presentation Element:* Minimal slides highlighting computational graph merges with HPC concepts. - """ - }, - { - "group": "Slides 4–5: Ollama & LLaMA-Based Models", - "content": r""" -**Paper 3** -*Reference:* Touvron, H. et al. “LLaMA: Open and Efficient Foundation Language Models.” arXiv:2302.13971 (2023) -*Key Points:* -- Architecture, training efficiency, and open-source benefits. -- Relevance to Ollama (lightweight local LLaMA inference). -*Presentation Element:* Short video demo of an Ollama prompt or model reply. - -**Paper 4** -*Reference:* Zhang, M. et al. “Exploring LLaMA Derivatives for Local Inference.” arXiv:2303.???? (Hypothetical) -*Key Points:* -- Techniques for running large models on consumer-grade hardware. -- Model quantization, CPU/GPU scheduling. -*Presentation Element:* Mermaid sequence diagram comparing server-based vs. local inference pipelines. - """ - }, - { - "group": "Slides 6–7: Deepseek MoE + Chain of Thought (CoT)", - "content": r""" -**Paper 5** -*Reference:* Fedus, W., Zoph, B., Shazeer, N. “Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.” arXiv:2101.03961 (2021) -*Key Points:* -- Mixture-of-Experts (MoE) approach to scale large models. -- Efficiency gains via sparse routing. -*Presentation Element:* Visual MoE block diagram with color-coded experts. - -**Paper 6** -*Reference:* Wei, J. et al. “Chain-of-Thought Prompting Elicits Reasoning in Large Language Models.” arXiv:2201.11903 (2022) -*Key Points:* -- Step-by-step reasoning prompts improve logical consistency. -- Potential synergy with MoE for specialized “reasoning experts.” -*Presentation Element:* Mermaid mind map illustrating short CoT vs. detailed CoT. - """ - }, - { - "group": "Slides 8–9: Hugging Face SFT Trainer", - "content": r""" -**Paper 7** -*Reference:* Wolf, T. et al. “Transformers: State-of-the-Art Natural Language Processing.” arXiv:1910.03771 (2020) -*Key Points:* -- Core library behind Hugging Face’s ecosystem. -- Transformer architecture fundamentals. -*Presentation Element:* Show how SFTTrainer (hypothetical name) builds on Trainer for supervised finetuning. - -**Paper 8** -*Reference:* Houlsby, N. et al. 
“Parameter-Efficient Transfer Learning for NLP.” arXiv:1902.00751 (2019) -*Key Points:* -- Techniques like adapters, LoRA, or selective layer freezing. -- Impact on training efficiency and model size. -*Presentation Element:* A side-by-side bar chart showing reduction in GPU hours with parameter-efficient methods. - """ - }, - { - "group": "Slides 10–11: Knowledge Distillation & Mermaid Graphs", - "content": r""" -**Paper 9** -*Reference:* Hinton, G., Vinyals, O., Dean, J. “Distilling the Knowledge in a Neural Network.” arXiv:1503.02531 (2015) -*Key Points:* -- Transfer knowledge from large “teacher” models to small “student” models. -- Temperature scaling and teacher-student training. -*Presentation Element:* Mermaid flowchart detailing teacher–student relationships. - -**Paper 10** -*Reference:* Chen, X. et al. “Graph-Based Knowledge Distillation for Neural Networks.” arXiv:2105.???? (Hypothetical) -*Key Points:* -- Represent model layers and hidden states as nodes & edges. -- Synergy with SFT and domain adaptation. -*Presentation Element:* Mermaid graph diagram linking teacher network nodes to student network nodes. - """ - }, - { - "group": "Slides 12–13: Crowdsourcing & Agents for Evaluation", - "content": r""" -**Paper 11** -*Reference:* Callison-Burch, C. “Fast, Cheap, and Creative: Evaluating Translation Quality Using Amazon’s Mechanical Turk.” arXiv:0907.5225 (2009) -*Key Points:* -- Crowdsourcing pipeline for large-scale text evaluation. -- Reliability strategies: gold standards, inter-annotator agreement. -*Presentation Element:* Timeline comparing tasks for crowdworkers vs. automated agents. - -**Paper 12** -*Reference:* Nie, Y. et al. “Adversarial NLI: A New Benchmark for Natural Language Understanding.” arXiv:1910.14599 (2019) -*Key Points:* -- Human-and-model-in-the-loop adversarial examples. -- Incremental data curation to improve robustness. -*Presentation Element:* Short audio explanation of adversarial example refinement. - """ - }, - { - "group": "Slides 14–15: Python + Gradio/Streamlit", - "content": r""" -**Paper 13** -*Reference:* Abid, A. et al. “Gradio: A User Interface for Interactive Machine Learning.” arXiv:2101.???? (Hypothetical) -*Key Points:* -- Build quick demos and capture user feedback. -- Invaluable for crowdsourced data collection and real-time model updates. -*Presentation Element:* 10-second video demo of a Gradio UI (e.g. a chatbot or image classifier). - -**Paper 14** -*Reference:* [Streamlit Team], “Streamlit: Democratizing Data App Creation.” arXiv:2004.???? (Hypothetical) -*Key Points:* -- Turning Python scripts into web apps effortlessly. -- Useful for HPC dashboards and debugging distributed training. -*Presentation Element:* Animated slides showing how to add interactive widgets with minimal code. - """ - }, - { - "group": "Slides 16–17: HPC for Python-Based AI", - "content": r""" -**Paper 15** -*Reference:* Shoeybi, M. et al. “Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.” arXiv:1909.08053 (2019) -*Key Points:* -- Scaling large models via model parallelism on HPC clusters. -- Integration with NVIDIA libraries (e.g. NCCL). -*Presentation Element:* Mermaid architecture diagram illustrating parallel pipelines. - -**Paper 16** -*Reference:* Huang, Y. et al. “GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism.” arXiv:1811.06965 (2019) -*Key Points:* -- Overlap of communication and computation for HPC efficiency. -- Synergy with MoE or large LLaMA models. 
-*Presentation Element:* Throughput vs. latency charts and an HPC cluster image. - """ - }, - { - "group": "Slides 18–19: Semantic & Episodic Memory + RLHF", - "content": r""" -**Paper 17** -*Reference:* Ouyang, X. et al. “Integrating Episodic and Semantic Memory for Task-Oriented Dialogue.” arXiv:2105.???? (Hypothetical) -*Key Points:* -- Differentiate short-term episodic from long-term semantic context. -- Improves consistency and factual correctness in dialogue. -*Presentation Element:* Mermaid diagram contrasting ephemeral vs. persistent memory flows. - -**Paper 18** -*Reference:* Ouyang, X. et al. “Training Language Models to Follow Instructions with Human Feedback.” arXiv:2203.02155 (2022) -*Key Points:* -- Reinforcement Learning from Human Feedback (RLHF). -- Align model outputs with user preferences and ethical guidelines. -*Presentation Element:* RLHF pseudo-code snippet and a timeline of preference collection. - """ - }, - { - "group": "Slides 20–21: Transfer Learning & “Learning for Good”", - "content": r""" -**Paper 19** -*Reference:* Ruder, S. “A Survey on Transfer Learning for NLP.” arXiv:1910.?? (2019) -*Key Points:* -- Overview of transfer learning strategies (fine-tuning, adapters, multitask learning). -- Quickly customize large pre-trained models. -*Presentation Element:* Graph of performance gains vs. training time. - -**Paper 20** -*Reference:* Zhang, Y., Yang, Q. “A Survey on Multi-Task Learning.” arXiv:1707.08114 (2017) -*Key Points:* -- Train one model on multiple tasks to share representations. -- Synergy with “Learning for Good” scenarios (e.g., medical, climate). -*Presentation Element:* Mermaid multi-task diagram showing convergence in shared layers. - """ - }, - { - "group": "Slide 22: Closing & Next Steps", - "content": r""" -**Key Takeaways:** -- **Integration:** Every paper contributes to an end-to-end AI pipeline—from HPC scaling to crowdsourced evaluation. -- **Modular Approach:** Combining PyTorch, Hugging Face SFT, and knowledge distillation leads to efficient model development. -- **Interactive Demonstrations:** Leveraging Gradio/Streamlit and RLHF creates user-friendly, human-centric AI experiences. -- **Future Work:** Explore deeper synergies among MoE, HPC, and memory-based architectures. - -**Media:** -- Concluding audio clip. -- (Optionally) a final Mermaid diagram linking all stages: data ingestion → HPC training → crowdsourcing → RLHF → model deployment. 
- """ - } - ] - st.session_state.current_index = 0 # Initialize the current slide index - - -# Set up the page configuration -st.set_page_config(page_title="AI Presentation Outline", layout="wide") -st.title("AI Toolbox Presentation Outline") - -# Sidebar: Navigation and slide group addition -st.sidebar.header("Navigation") - -# --- Option to add a new slide group --- -with st.sidebar.expander("Add New Slide Group"): - with st.form("new_slide_form"): - new_group = st.text_input("Slide Group Title") - new_content = st.text_area("Slide Group Content (Markdown)", height=200) - submitted = st.form_submit_button("Add Slide Group") - if submitted: - if new_group.strip() and new_content.strip(): - st.session_state.slide_groups.append({ - "group": new_group.strip(), - "content": new_content.strip() - }) - st.success(f"Added slide group: {new_group}") - else: - st.error("Please provide both a title and content.") - -# --- Slide group selector --- -slide_titles = [slide["group"] for slide in st.session_state.slide_groups] -# Use a selectbox whose index is synced with session_state.current_index -selected_index = st.sidebar.selectbox( - "Select Slide Group", - range(len(slide_titles)), - index=st.session_state.current_index, - format_func=lambda i: slide_titles[i] -) -st.session_state.current_index = selected_index - -# --- Navigation buttons --- -cols = st.sidebar.columns(2) -if cols[0].button("⟨ Previous"): - st.session_state.current_index = max(st.session_state.current_index - 1, 0) -if cols[1].button("Next ⟩"): - st.session_state.current_index = min(st.session_state.current_index + 1, len(slide_titles) - 1) - -# Main: Display the selected slide group's details -current_slide = st.session_state.slide_groups[st.session_state.current_index] -st.header(current_slide["group"]) -st.markdown(current_slide["content"], unsafe_allow_html=True) - - - -SlideDeck = ''' - - -import streamlit as st - -# Initialize the slide groups in session state on first run. -if "slide_groups" not in st.session_state: - st.session_state.slide_groups = [ - { - "group": "Slide 1: Introduction", - "content": r""" -**Title:** AI Toolbox: 20 Papers in 5 Minutes -**Goal:** Show how these topics (Torch, Ollama, Deepseek, SFT, knowledge distillation, crowdsourcing, etc.) tie together into an end-to-end AI pipeline. -**Media:** Quick intro audio & a short video clip highlighting AI breakthroughs. - """ - }, - { - "group": "Slides 2–3: Torch (PyTorch Foundations)", - "content": r""" -**Paper 1** -*Reference:* Paszke, A. et al. “PyTorch: An Imperative Style, High-Performance Deep Learning Library.” arXiv:1912.01703 (2019) -*Key Points:* -- Dynamic computation graphs for rapid prototyping. -- Strong GPU acceleration and broad community support. -*Presentation Element:* Brief code snippet in Python + a Mermaid flowchart showing how forward/backprop flows in PyTorch. - -**Paper 2** -*Reference:* Paszke, A. et al. “Automatic Differentiation in PyTorch.” arXiv:1707.?? (Hypothetical reference) -*Key Points:* -- Core mechanism behind autograd. -- How tensor operations are tracked and reversed for gradients. -*Presentation Element:* Minimal slides highlighting computational graph merges with HPC concepts. - """ - }, - { - "group": "Slides 4–5: Ollama & LLaMA-Based Models", - "content": r""" -**Paper 3** -*Reference:* Touvron, H. et al. “LLaMA: Open and Efficient Foundation Language Models.” arXiv:2302.13971 (2023) -*Key Points:* -- Architecture, training efficiency, and open-source benefits. 
-- Relevance to Ollama (lightweight local LLaMA inference). -*Presentation Element:* Short video demo of an Ollama prompt or model reply. - -**Paper 4** -*Reference:* Zhang, M. et al. “Exploring LLaMA Derivatives for Local Inference.” arXiv:2303.???? (Hypothetical) -*Key Points:* -- Techniques for running large models on consumer-grade hardware. -- Model quantization, CPU/GPU scheduling. -*Presentation Element:* Mermaid sequence diagram comparing server-based vs. local inference pipelines. - """ - }, - { - "group": "Slides 6–7: Deepseek MoE + Chain of Thought (CoT)", - "content": r""" -**Paper 5** -*Reference:* Fedus, W., Zoph, B., Shazeer, N. “Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity.” arXiv:2101.03961 (2021) -*Key Points:* -- Mixture-of-Experts (MoE) approach to scale large models. -- Efficiency gains via sparse routing. -*Presentation Element:* Visual MoE block diagram with color-coded experts. - -**Paper 6** -*Reference:* Wei, J. et al. “Chain-of-Thought Prompting Elicits Reasoning in Large Language Models.” arXiv:2201.11903 (2022) -*Key Points:* -- Step-by-step reasoning prompts improve logical consistency. -- Potential synergy with MoE for specialized “reasoning experts.” -*Presentation Element:* Mermaid mind map illustrating short CoT vs. detailed CoT. - """ - }, - { - "group": "Slides 8–9: Hugging Face SFT Trainer", - "content": r""" -**Paper 7** -*Reference:* Wolf, T. et al. “Transformers: State-of-the-Art Natural Language Processing.” arXiv:1910.03771 (2020) -*Key Points:* -- Core library behind Hugging Face’s ecosystem. -- Transformer architecture fundamentals. -*Presentation Element:* Show how SFTTrainer (hypothetical name) builds on Trainer for supervised finetuning. - -**Paper 8** -*Reference:* Houlsby, N. et al. “Parameter-Efficient Transfer Learning for NLP.” arXiv:1902.00751 (2019) -*Key Points:* -- Techniques like adapters, LoRA, or selective layer freezing. -- Impact on training efficiency and model size. -*Presentation Element:* A side-by-side bar chart showing reduction in GPU hours with parameter-efficient methods. - """ - }, - { - "group": "Slides 10–11: Knowledge Distillation & Mermaid Graphs", - "content": r""" -**Paper 9** -*Reference:* Hinton, G., Vinyals, O., Dean, J. “Distilling the Knowledge in a Neural Network.” arXiv:1503.02531 (2015) -*Key Points:* -- Transfer knowledge from large “teacher” models to small “student” models. -- Temperature scaling and teacher-student training. -*Presentation Element:* Mermaid flowchart detailing teacher–student relationships. - -**Paper 10** -*Reference:* Chen, X. et al. “Graph-Based Knowledge Distillation for Neural Networks.” arXiv:2105.???? (Hypothetical) -*Key Points:* -- Represent model layers and hidden states as nodes & edges. -- Synergy with SFT and domain adaptation. -*Presentation Element:* Mermaid graph diagram linking teacher network nodes to student network nodes. - """ - }, - { - "group": "Slides 12–13: Crowdsourcing & Agents for Evaluation", - "content": r""" -**Paper 11** -*Reference:* Callison-Burch, C. “Fast, Cheap, and Creative: Evaluating Translation Quality Using Amazon’s Mechanical Turk.” arXiv:0907.5225 (2009) -*Key Points:* -- Crowdsourcing pipeline for large-scale text evaluation. -- Reliability strategies: gold standards, inter-annotator agreement. -*Presentation Element:* Timeline comparing tasks for crowdworkers vs. automated agents. - -**Paper 12** -*Reference:* Nie, Y. et al. 
“Adversarial NLI: A New Benchmark for Natural Language Understanding.” arXiv:1910.14599 (2019) -*Key Points:* -- Human-and-model-in-the-loop adversarial examples. -- Incremental data curation to improve robustness. -*Presentation Element:* Short audio explanation of adversarial example refinement. - """ - }, - { - "group": "Slides 14–15: Python + Gradio/Streamlit", - "content": r""" -**Paper 13** -*Reference:* Abid, A. et al. “Gradio: A User Interface for Interactive Machine Learning.” arXiv:2101.???? (Hypothetical) -*Key Points:* -- Build quick demos and capture user feedback. -- Invaluable for crowdsourced data collection and real-time model updates. -*Presentation Element:* 10-second video demo of a Gradio UI (e.g. a chatbot or image classifier). - -**Paper 14** -*Reference:* [Streamlit Team], “Streamlit: Democratizing Data App Creation.” arXiv:2004.???? (Hypothetical) -*Key Points:* -- Turning Python scripts into web apps effortlessly. -- Useful for HPC dashboards and debugging distributed training. -*Presentation Element:* Animated slides showing how to add interactive widgets with minimal code. - """ - }, - { - "group": "Slides 16–17: HPC for Python-Based AI", - "content": r""" -**Paper 15** -*Reference:* Shoeybi, M. et al. “Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.” arXiv:1909.08053 (2019) -*Key Points:* -- Scaling large models via model parallelism on HPC clusters. -- Integration with NVIDIA libraries (e.g. NCCL). -*Presentation Element:* Mermaid architecture diagram illustrating parallel pipelines. - -**Paper 16** -*Reference:* Huang, Y. et al. “GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism.” arXiv:1811.06965 (2019) -*Key Points:* -- Overlap of communication and computation for HPC efficiency. -- Synergy with MoE or large LLaMA models. -*Presentation Element:* Throughput vs. latency charts and an HPC cluster image. - """ - }, - { - "group": "Slides 18–19: Semantic & Episodic Memory + RLHF", - "content": r""" -**Paper 17** -*Reference:* Ouyang, X. et al. “Integrating Episodic and Semantic Memory for Task-Oriented Dialogue.” arXiv:2105.???? (Hypothetical) -*Key Points:* -- Differentiate short-term episodic from long-term semantic context. -- Improves consistency and factual correctness in dialogue. -*Presentation Element:* Mermaid diagram contrasting ephemeral vs. persistent memory flows. - -**Paper 18** -*Reference:* Ouyang, X. et al. “Training Language Models to Follow Instructions with Human Feedback.” arXiv:2203.02155 (2022) -*Key Points:* -- Reinforcement Learning from Human Feedback (RLHF). -- Align model outputs with user preferences and ethical guidelines. -*Presentation Element:* RLHF pseudo-code snippet and a timeline of preference collection. - """ - }, - { - "group": "Slides 20–21: Transfer Learning & “Learning for Good”", - "content": r""" -**Paper 19** -*Reference:* Ruder, S. “A Survey on Transfer Learning for NLP.” arXiv:1910.?? (2019) -*Key Points:* -- Overview of transfer learning strategies (fine-tuning, adapters, multitask learning). -- Quickly customize large pre-trained models. -*Presentation Element:* Graph of performance gains vs. training time. - -**Paper 20** -*Reference:* Zhang, Y., Yang, Q. “A Survey on Multi-Task Learning.” arXiv:1707.08114 (2017) -*Key Points:* -- Train one model on multiple tasks to share representations. -- Synergy with “Learning for Good” scenarios (e.g., medical, climate). 
-*Presentation Element:* Mermaid multi-task diagram showing convergence in shared layers. - """ - }, - { - "group": "Slide 22: Closing & Next Steps", - "content": r""" -**Key Takeaways:** -- **Integration:** Every paper contributes to an end-to-end AI pipeline—from HPC scaling to crowdsourced evaluation. -- **Modular Approach:** Combining PyTorch, Hugging Face SFT, and knowledge distillation leads to efficient model development. -- **Interactive Demonstrations:** Leveraging Gradio/Streamlit and RLHF creates user-friendly, human-centric AI experiences. -- **Future Work:** Explore deeper synergies among MoE, HPC, and memory-based architectures. - -**Media:** -- Concluding audio clip. -- (Optionally) a final Mermaid diagram linking all stages: data ingestion → HPC training → crowdsourcing → RLHF → model deployment. - """ - } - ] - st.session_state.current_index = 0 # Initialize the current slide index - - -# Set up the page configuration -st.set_page_config(page_title="AI Presentation Outline", layout="wide") -st.title("AI Toolbox Presentation Outline") - -# Sidebar: Navigation and slide group addition -st.sidebar.header("Navigation") - -# --- Option to add a new slide group --- -with st.sidebar.expander("Add New Slide Group"): - with st.form("new_slide_form"): - new_group = st.text_input("Slide Group Title") - new_content = st.text_area("Slide Group Content (Markdown)", height=200) - submitted = st.form_submit_button("Add Slide Group") - if submitted: - if new_group.strip() and new_content.strip(): - st.session_state.slide_groups.append({ - "group": new_group.strip(), - "content": new_content.strip() - }) - st.success(f"Added slide group: {new_group}") - else: - st.error("Please provide both a title and content.") - -# --- Slide group selector --- -slide_titles = [slide["group"] for slide in st.session_state.slide_groups] -# Use a selectbox whose index is synced with session_state.current_index -selected_index = st.sidebar.selectbox( - "Select Slide Group", - range(len(slide_titles)), - index=st.session_state.current_index, - format_func=lambda i: slide_titles[i] -) -st.session_state.current_index = selected_index - -# --- Navigation buttons --- -cols = st.sidebar.columns(2) -if cols[0].button("⟨ Previous"): - st.session_state.current_index = max(st.session_state.current_index - 1, 0) -if cols[1].button("Next ⟩"): - st.session_state.current_index = min(st.session_state.current_index + 1, len(slide_titles) - 1) - -# Main: Display the selected slide group's details -current_slide = st.session_state.slide_groups[st.session_state.current_index] -st.header(current_slide["group"]) -st.markdown(current_slide["content"], unsafe_allow_html=True) - - - - - - - - - - - - - - - - - -# ------- Add this onto the slides! - - - -### State-of-the-Art Techniques for Self Reward AI - -1. 🔍 Reinforcement Learning: AI agents learn to evaluate their actions by receiving rewards or penalties based on their performance. -2. 💡 Self-Supervision: AI models learn to generate their own labels or rewards, reducing the need for expensive human-annotated data. -3. 🧠 Metacognition: AI systems are equipped with the ability to monitor, evaluate, and regulate their own cognitive processes. - -Examples: -- 🚑 Telemed: AI assistants that can self-evaluate their medical advice and identify areas for improvement. -- 🤖 AI Developers: AI models that can critique their code or outputs and provide self-feedback for iterative refinement. 
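To make the self-reward idea above concrete, here is a minimal, self-contained Python sketch of a self-reward refinement loop: generate candidate outputs, score them with a self-evaluation function, and keep the best. `generate_draft` and `self_score` are hypothetical stand-ins invented for this sketch; a real system would replace them with an LLM call and a learned or principle-based evaluator.

```python
# Toy sketch of a self-reward refinement loop (illustrative only).
# `generate_draft` and `self_score` are hypothetical stand-ins for a real
# model's generation and self-evaluation steps.
import random

def generate_draft(prompt: str, rng: random.Random) -> str:
    # Stand-in for model generation: returns one candidate answer.
    templates = [
        f"Short answer to: {prompt}",
        f"Detailed answer to: {prompt} with step-by-step reasoning",
        f"Answer to: {prompt} citing two sources",
    ]
    return rng.choice(templates)

def self_score(draft: str) -> float:
    # Stand-in for self-evaluation: reward structure and detail heuristically.
    score = 0.0
    score += 0.5 if "step-by-step" in draft else 0.0
    score += 0.3 if "sources" in draft else 0.0
    score += min(len(draft) / 200.0, 0.2)
    return score

def self_reward_refine(prompt: str, rounds: int = 5, seed: int = 0) -> str:
    # Keep the highest self-scored draft across several refinement rounds.
    rng = random.Random(seed)
    best_draft, best_score = "", float("-inf")
    for _ in range(rounds):
        draft = generate_draft(prompt, rng)
        score = self_score(draft)
        if score > best_score:
            best_draft, best_score = draft, score
    return best_draft

if __name__ == "__main__":
    print(self_reward_refine("What are common causes of chest pain?"))
```

The same skeleton applies to the Telemed and AI-developer examples above: only the scoring function changes.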
- -### Glossary (Keywords) - -- 🤖 **AI**: Artificial Intelligence -- 🧠 **Metacognition**: Awareness and understanding of one's thought processes -- 🔄 **Reinforcement Learning**: Learning based on rewards and penalties -- 🔍 **Self-Supervision**: Learning from self-generated labels or rewards -- 💡 **Self-Reward**: Ability to evaluate and score one's own performance -- 🚑 **Telemed**: Telemedicine, remote delivery of healthcare services - -```mermaid -graph LR - AI --> Metacognition - AI --> ReinforcementLearning - AI --> SelfSupervision - SelfSupervision --> SelfReward - Metacognition --> SelfReward - ReinforcementLearning --> SelfReward - SelfReward --> Telemed -``` - - -# 🩺🔍 Search Results -### 02 Dec 2023 | [Principle-Driven Self-Alignment of Language Models from Scratch with Minimal Human Supervision](https://arxiv.org/abs/2305.03047) | [⬇️](https://arxiv.org/pdf/2305.03047) -*Zhiqing Sun, Yikang Shen, Qinhong Zhou, Hongxin Zhang, Zhenfang Chen, David Cox, Yiming Yang, Chuang Gan* - - Recent AI-assistant agents, such as ChatGPT, predominantly rely on supervised -fine-tuning (SFT) with human annotations and reinforcement learning from human -feedback (RLHF) to align the output of large language models (LLMs) with human -intentions, ensuring they are helpful, ethical, and reliable. However, this -dependence can significantly constrain the true potential of AI-assistant -agents due to the high cost of obtaining human supervision and the related -issues on quality, reliability, diversity, self-consistency, and undesirable -biases. To address these challenges, we propose a novel approach called -SELF-ALIGN, which combines principle-driven reasoning and the generative power -of LLMs for the self-alignment of AI agents with minimal human supervision. Our -approach encompasses four stages: first, we use an LLM to generate synthetic -prompts, and a topic-guided method to augment the prompt diversity; second, we -use a small set of human-written principles for AI models to follow, and guide -the LLM through in-context learning from demonstrations (of principles -application) to produce helpful, ethical, and reliable responses to user's -queries; third, we fine-tune the original LLM with the high-quality -self-aligned responses so that the resulting model can generate desirable -responses for each query directly without the principle set and the -demonstrations anymore; and finally, we offer a refinement step to address the -issues of overly-brief or indirect responses. Applying SELF-ALIGN to the -LLaMA-65b base language model, we develop an AI assistant named Dromedary. With -fewer than 300 lines of human annotations (including < 200 seed prompts, 16 -generic principles, and 5 exemplars for in-context learning). Dromedary -significantly surpasses the performance of several state-of-the-art AI systems, -including Text-Davinci-003 and Alpaca, on benchmark datasets with various -settings. 
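The abstract above outlines four stages: synthetic prompt generation, principle-driven in-context responses, fine-tuning on the self-aligned data, and a refinement pass. The toy skeleton below mirrors that flow under a loose reading of the paper; it is not the authors' code, and every function here is an illustrative stand-in for an LLM-driven step.

```python
# Toy skeleton of the four SELF-ALIGN stages summarized above (not the
# authors' code). Model calls are replaced by simple stand-in functions.

PRINCIPLES = ["be helpful", "be ethical", "be reliable"]

def generate_synthetic_prompts(topics):
    # Stage 1: topic-guided synthetic prompt generation (stand-in).
    return [f"Explain {t} to a beginner." for t in topics]

def principle_driven_response(prompt):
    # Stage 2: respond under the principle set via in-context guidance (stand-in).
    return f"[{'; '.join(PRINCIPLES)}] Response to: {prompt}"

def fine_tune(pairs):
    # Stage 3: supervised fine-tuning on (prompt, self-aligned response) pairs.
    # Here we only report the dataset size to show where training would happen.
    return {"num_training_pairs": len(pairs)}

def refine(response):
    # Stage 4: expand overly brief or indirect responses.
    return response if len(response) > 40 else response + " (expanded with detail)"

if __name__ == "__main__":
    prompts = generate_synthetic_prompts(["mixture-of-experts", "RLHF"])
    pairs = [(p, refine(principle_driven_response(p))) for p in prompts]
    print(fine_tune(pairs), pairs[0][1])
```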
- ---------------- - -### 15 Dec 2022 | [Constitutional AI: Harmlessness from AI Feedback](https://arxiv.org/abs/2212.08073) | [⬇️](https://arxiv.org/pdf/2212.08073) -*Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, Jared Kaplan* - - As AI systems become more capable, we would like to enlist their help to -supervise other AIs. We experiment with methods for training a harmless AI -assistant through self-improvement, without any human labels identifying -harmful outputs. The only human oversight is provided through a list of rules -or principles, and so we refer to the method as 'Constitutional AI'. The -process involves both a supervised learning and a reinforcement learning phase. -In the supervised phase we sample from an initial model, then generate -self-critiques and revisions, and then finetune the original model on revised -responses. In the RL phase, we sample from the finetuned model, use a model to -evaluate which of the two samples is better, and then train a preference model -from this dataset of AI preferences. We then train with RL using the preference -model as the reward signal, i.e. we use 'RL from AI Feedback' (RLAIF). As a -result we are able to train a harmless but non-evasive AI assistant that -engages with harmful queries by explaining its objections to them. Both the SL -and RL methods can leverage chain-of-thought style reasoning to improve the -human-judged performance and transparency of AI decision making. These methods -make it possible to control AI behavior more precisely and with far fewer human -labels. - ---------------- - -### 09 Oct 2023 | [SALMON: Self-Alignment with Principle-Following Reward Models](https://arxiv.org/abs/2310.05910) | [⬇️](https://arxiv.org/pdf/2310.05910) -*Zhiqing Sun, Yikang Shen, Hongxin Zhang, Qinhong Zhou, Zhenfang Chen, David Cox, Yiming Yang, Chuang Gan* - - Supervised Fine-Tuning (SFT) on response demonstrations combined with -Reinforcement Learning from Human Feedback (RLHF) constitutes a powerful -paradigm for aligning LLM-based AI agents. However, a significant limitation of -such an approach is its dependency on high-quality human annotations, making -its application to intricate tasks challenging due to difficulties in obtaining -consistent response demonstrations and in-distribution response preferences. -This paper presents a novel approach, namely SALMON (Self-ALignMent with -principle-fOllowiNg reward models), to align base language models with minimal -human supervision, using only a small set of human-defined principles, yet -achieving superior performance. Central to our approach is a -principle-following reward model. Trained on synthetic preference data, this -model can generate reward scores based on arbitrary human-defined principles. 
-By merely adjusting these principles during the RL training phase, we gain full -control over the preferences with the reward model, subsequently influencing -the behavior of the RL-trained policies, and eliminating the reliance on the -collection of online human preferences. Applying our method to the LLaMA-2-70b -base language model, we developed an AI assistant named Dromedary-2. With only -6 exemplars for in-context learning and 31 human-defined principles, -Dromedary-2 significantly surpasses the performance of several state-of-the-art -AI systems, including LLaMA-2-Chat-70b, on various benchmark datasets. We have -open-sourced the code and model weights to encourage further research into -aligning LLM-based AI agents with enhanced supervision efficiency, improved -controllability, and scalable oversight. - ---------------- - -### 05 Jul 2021 | [The MineRL BASALT Competition on Learning from Human Feedback](https://arxiv.org/abs/2107.01969) | [⬇️](https://arxiv.org/pdf/2107.01969) -*Rohin Shah, Cody Wild, Steven H. Wang, Neel Alex, Brandon Houghton, William Guss, Sharada Mohanty, Anssi Kanervisto, Stephanie Milani, Nicholay Topin, Pieter Abbeel, Stuart Russell, Anca Dragan* - - The last decade has seen a significant increase of interest in deep learning -research, with many public successes that have demonstrated its potential. As -such, these systems are now being incorporated into commercial products. With -this comes an additional challenge: how can we build AI systems that solve -tasks where there is not a crisp, well-defined specification? While multiple -solutions have been proposed, in this competition we focus on one in -particular: learning from human feedback. Rather than training AI systems using -a predefined reward function or using a labeled dataset with a predefined set -of categories, we instead train the AI system using a learning signal derived -from some form of human feedback, which can evolve over time as the -understanding of the task changes, or as the capabilities of the AI system -improve. - The MineRL BASALT competition aims to spur forward research on this important -class of techniques. We design a suite of four tasks in Minecraft for which we -expect it will be hard to write down hardcoded reward functions. These tasks -are defined by a paragraph of natural language: for example, "create a -waterfall and take a scenic picture of it", with additional clarifying details. -Participants must train a separate agent for each task, using any method they -want. Agents are then evaluated by humans who have read the task description. -To help participants get started, we provide a dataset of human demonstrations -on each of the four tasks, as well as an imitation learning baseline that -leverages these demonstrations. - Our hope is that this competition will improve our ability to build AI -systems that do what their designers intend them to do, even when the intent -cannot be easily formalized. Besides allowing AI to solve more tasks, this can -also enable more effective regulation of AI systems, as well as making progress -on the value alignment problem. 
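Constitutional AI and SALMON above both train a preference (reward) model from pairwise comparisons, and BASALT likewise replaces hand-written reward functions with human feedback. As a rough illustration of that shared ingredient, the PyTorch sketch below fits a tiny reward model with a Bradley-Terry style loss on synthetic "chosen vs. rejected" embeddings; the architecture, dimensions, and data are placeholders, not any paper's implementation.

```python
# Minimal PyTorch sketch of training a reward model from pairwise preferences
# (a Bradley-Terry style objective as used in RLHF/RLAIF pipelines; this is an
# illustration, not the implementation from any paper above).
import torch
import torch.nn as nn

class TinyRewardModel(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, 32), nn.ReLU(), nn.Linear(32, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x).squeeze(-1)  # one scalar reward per example

def preference_loss(r_chosen: torch.Tensor, r_rejected: torch.Tensor) -> torch.Tensor:
    # -log sigmoid(r_chosen - r_rejected): pushes the chosen response's reward
    # above the rejected one's.
    return -torch.nn.functional.logsigmoid(r_chosen - r_rejected).mean()

if __name__ == "__main__":
    torch.manual_seed(0)
    model = TinyRewardModel()
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Random stand-ins for (chosen, rejected) response embeddings.
    chosen = torch.randn(64, 16) + 0.5
    rejected = torch.randn(64, 16)
    for step in range(200):
        loss = preference_loss(model(chosen), model(rejected))
        opt.zero_grad()
        loss.backward()
        opt.step()
    print(f"final preference loss: {loss.item():.3f}")
```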
- ---------------- - -### 07 Feb 2024 | [CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay](https://arxiv.org/abs/2402.04858) | [⬇️](https://arxiv.org/pdf/2402.04858) -*Natasha Butt, Blazej Manczak, Auke Wiggers, Corrado Rainone, David Zhang, Micha\"el Defferrard, Taco Cohen* - - Large language models are increasingly solving tasks that are commonly -believed to require human-level reasoning ability. However, these models still -perform very poorly on benchmarks of general intelligence such as the -Abstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a -programming-by-examples problem, and introduce a novel and scalable method for -language model self-improvement called Code Iteration (CodeIt). Our method -iterates between 1) program sampling and hindsight relabeling, and 2) learning -from prioritized experience replay. By relabeling the goal of an episode (i.e., -the target program output given input) to the realized output produced by the -sampled program, our method effectively deals with the extreme sparsity of -rewards in program synthesis. Applying CodeIt to the ARC dataset, we -demonstrate that prioritized hindsight replay, along with pre-training and -data-augmentation, leads to successful inter-task generalization. CodeIt is the -first neuro-symbolic approach that scales to the full ARC evaluation dataset. -Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art -performance and outperforming existing neural and symbolic baselines. - ---------------- - -### 26 Oct 2023 | [Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models](https://arxiv.org/abs/2310.17567) | [⬇️](https://arxiv.org/pdf/2310.17567) -*Dingli Yu, Simran Kaur, Arushi Gupta, Jonah Brown-Cohen, Anirudh Goyal, Sanjeev Arora* - - With LLMs shifting their role from statistical modeling of language to -serving as general-purpose AI agents, how should LLM evaluations change? -Arguably, a key ability of an AI agent is to flexibly combine, as needed, the -basic skills it has learned. The capability to combine skills plays an -important role in (human) pedagogy and also in a paper on emergence phenomena -(Arora & Goyal, 2023). - This work introduces Skill-Mix, a new evaluation to measure ability to -combine skills. Using a list of $N$ skills the evaluator repeatedly picks -random subsets of $k$ skills and asks the LLM to produce text combining that -subset of skills. Since the number of subsets grows like $N^k$, for even modest -$k$ this evaluation will, with high probability, require the LLM to produce -text significantly different from any text in the training set. The paper -develops a methodology for (a) designing and administering such an evaluation, -and (b) automatic grading (plus spot-checking by humans) of the results using -GPT-4 as well as the open LLaMA-2 70B model. - Administering a version of to popular chatbots gave results that, while -generally in line with prior expectations, contained surprises. Sizeable -differences exist among model capabilities that are not captured by their -ranking on popular LLM leaderboards ("cramming for the leaderboard"). -Furthermore, simple probability calculations indicate that GPT-4's reasonable -performance on $k=5$ is suggestive of going beyond "stochastic parrot" behavior -(Bender et al., 2021), i.e., it combines skills in ways that it had not seen -during training. - We sketch how the methodology can lead to a Skill-Mix based eco-system of -open evaluations for AI capabilities of future models. 
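The combinatorial core of Skill-Mix, drawing random k-skill subsets from a list of N skills, is easy to sketch. In the snippet below the skill names and prompt template are invented for illustration; only the subset sampling and the C(N, k) count reflect the description above.

```python
# Small sketch of the Skill-Mix sampling idea: draw random k-sized subsets of
# skills and count how quickly the number of possible subsets grows.
# Skill names and the prompt template are illustrative assumptions.
import math
import random

SKILLS = ["metaphor", "modus ponens", "sarcasm", "statistics", "dialogue",
          "rhyme", "counterfactual", "persuasion", "humor", "citation"]

def sample_skill_mix(k: int, rng: random.Random) -> str:
    subset = rng.sample(SKILLS, k)
    return ("Write a short paragraph on a topic of your choice that "
            f"demonstrates all of these skills: {', '.join(subset)}.")

if __name__ == "__main__":
    rng = random.Random(42)
    for k in (2, 3, 5):
        print(f"k={k}: {math.comb(len(SKILLS), k)} possible subsets")
        print("  example prompt:", sample_skill_mix(k, rng))
```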
- ---------------- - -### 14 Sep 2023 | [Masked Diffusion with Task-awareness for Procedure Planning in Instructional Videos](https://arxiv.org/abs/2309.07409) | [⬇️](https://arxiv.org/pdf/2309.07409) -*Fen Fang, Yun Liu, Ali Koksal, Qianli Xu, Joo-Hwee Lim* - - A key challenge with procedure planning in instructional videos lies in how -to handle a large decision space consisting of a multitude of action types that -belong to various tasks. To understand real-world video content, an AI agent -must proficiently discern these action types (e.g., pour milk, pour water, open -lid, close lid, etc.) based on brief visual observation. Moreover, it must -adeptly capture the intricate semantic relation of the action types and task -goals, along with the variable action sequences. Recently, notable progress has -been made via the integration of diffusion models and visual representation -learning to address the challenge. However, existing models employ rudimentary -mechanisms to utilize task information to manage the decision space. To -overcome this limitation, we introduce a simple yet effective enhancement - a -masked diffusion model. The introduced mask acts akin to a task-oriented -attention filter, enabling the diffusion/denoising process to concentrate on a -subset of action types. Furthermore, to bolster the accuracy of task -classification, we harness more potent visual representation learning -techniques. In particular, we learn a joint visual-text embedding, where a text -embedding is generated by prompting a pre-trained vision-language model to -focus on human actions. We evaluate the method on three public datasets and -achieve state-of-the-art performance on multiple metrics. Code is available at -https://github.com/ffzzy840304/Masked-PDPP. - ---------------- - -### 07 Nov 2023 | [Selective Visual Representations Improve Convergence and Generalization for Embodied AI](https://arxiv.org/abs/2311.04193) | [⬇️](https://arxiv.org/pdf/2311.04193) -*Ainaz Eftekhar, Kuo-Hao Zeng, Jiafei Duan, Ali Farhadi, Ani Kembhavi, Ranjay Krishna* - - Embodied AI models often employ off the shelf vision backbones like CLIP to -encode their visual observations. Although such general purpose representations -encode rich syntactic and semantic information about the scene, much of this -information is often irrelevant to the specific task at hand. This introduces -noise within the learning process and distracts the agent's focus from -task-relevant visual cues. Inspired by selective attention in humans-the -process through which people filter their perception based on their -experiences, knowledge, and the task at hand-we introduce a parameter-efficient -approach to filter visual stimuli for embodied AI. Our approach induces a -task-conditioned bottleneck using a small learnable codebook module. This -codebook is trained jointly to optimize task reward and acts as a -task-conditioned selective filter over the visual observation. Our experiments -showcase state-of-the-art performance for object goal navigation and object -displacement across 5 benchmarks, ProcTHOR, ArchitecTHOR, RoboTHOR, AI2-iTHOR, -and ManipulaTHOR. The filtered representations produced by the codebook are -also able generalize better and converge faster when adapted to other -simulation environments such as Habitat. 
Our qualitative analyses show that -agents explore their environments more effectively and their representations -retain task-relevant information like target object recognition while ignoring -superfluous information about other objects. Code and pretrained models are -available at our project website: https://embodied-codebook.github.io. - ---------------- - -### 16 Feb 2023 | [Foundation Models for Natural Language Processing -- Pre-trained Language Models Integrating Media](https://arxiv.org/abs/2302.08575) | [⬇️](https://arxiv.org/pdf/2302.08575) -*Gerhard Paa{\ss} and Sven Giesselbach* - - This open access book provides a comprehensive overview of the state of the -art in research and applications of Foundation Models and is intended for -readers familiar with basic Natural Language Processing (NLP) concepts. Over -the recent years, a revolutionary new paradigm has been developed for training -models for NLP. These models are first pre-trained on large collections of text -documents to acquire general syntactic knowledge and semantic information. -Then, they are fine-tuned for specific tasks, which they can often solve with -superhuman accuracy. When the models are large enough, they can be instructed -by prompts to solve new tasks without any fine-tuning. Moreover, they can be -applied to a wide range of different media and problem domains, ranging from -image and video processing to robot control learning. Because they provide a -blueprint for solving many tasks in artificial intelligence, they have been -called Foundation Models. After a brief introduction to basic NLP models the -main pre-trained language models BERT, GPT and sequence-to-sequence transformer -are described, as well as the concepts of self-attention and context-sensitive -embedding. Then, different approaches to improving these models are discussed, -such as expanding the pre-training criteria, increasing the length of input -texts, or including extra knowledge. An overview of the best-performing models -for about twenty application areas is then presented, e.g., question answering, -translation, story generation, dialog systems, generating images from text, -etc. For each application area, the strengths and weaknesses of current models -are discussed, and an outlook on further developments is given. In addition, -links are provided to freely available program code. A concluding chapter -summarizes the economic opportunities, mitigation of risks, and potential -developments of AI. - ---------------- - -### 21 Dec 2023 | [Automating Human Tutor-Style Programming Feedback: Leveraging GPT-4 Tutor Model for Hint Generation and GPT-3.5 Student Model for Hint Validation](https://arxiv.org/abs/2310.03780) | [⬇️](https://arxiv.org/pdf/2310.03780) -*Tung Phung, Victor-Alexandru P\u{a}durean, Anjali Singh, Christopher Brooks, Jos\'e Cambronero, Sumit Gulwani, Adish Singla, Gustavo Soares* - - Generative AI and large language models hold great promise in enhancing -programming education by automatically generating individualized feedback for -students. We investigate the role of generative AI models in providing human -tutor-style programming hints to help students resolve errors in their buggy -programs. Recent works have benchmarked state-of-the-art models for various -feedback generation scenarios; however, their overall quality is still inferior -to human tutors and not yet ready for real-world deployment. 
In this paper, we -seek to push the limits of generative AI models toward providing high-quality -programming hints and develop a novel technique, GPT4Hints-GPT3.5Val. As a -first step, our technique leverages GPT-4 as a ``tutor'' model to generate -hints -- it boosts the generative quality by using symbolic information of -failing test cases and fixes in prompts. As a next step, our technique -leverages GPT-3.5, a weaker model, as a ``student'' model to further validate -the hint quality -- it performs an automatic quality validation by simulating -the potential utility of providing this feedback. We show the efficacy of our -technique via extensive evaluation using three real-world datasets of Python -programs covering a variety of concepts ranging from basic algorithms to -regular expressions and data analysis using pandas library. - ---------------- - -### 04 Feb 2024 | [STEVE-1: A Generative Model for Text-to-Behavior in Minecraft](https://arxiv.org/abs/2306.00937) | [⬇️](https://arxiv.org/pdf/2306.00937) -*Shalev Lifshitz, Keiran Paster, Harris Chan, Jimmy Ba, Sheila McIlraith* - - Constructing AI models that respond to text instructions is challenging, -especially for sequential decision-making tasks. This work introduces a -methodology, inspired by unCLIP, for instruction-tuning generative models of -behavior without relying on a large dataset of instruction-labeled -trajectories. Using this methodology, we create an instruction-tuned Video -Pretraining (VPT) model called STEVE-1, which can follow short-horizon -open-ended text and visual instructions in Minecraft. STEVE-1 is trained in two -steps: adapting the pretrained VPT model to follow commands in MineCLIP's -latent space, then training a prior to predict latent codes from text. This -allows us to finetune VPT through self-supervised behavioral cloning and -hindsight relabeling, reducing the need for costly human text annotations, and -all for only $60 of compute. By leveraging pretrained models like VPT and -MineCLIP and employing best practices from text-conditioned image generation, -STEVE-1 sets a new bar for open-ended instruction-following in Minecraft with -low-level controls (mouse and keyboard) and raw pixel inputs, far outperforming -previous baselines and robustly completing 12 of 13 tasks in our early-game -evaluation suite. We provide experimental evidence highlighting key factors for -downstream performance, including pretraining, classifier-free guidance, and -data scaling. All resources, including our model weights, training scripts, and -evaluation tools are made available for further research. - ---------------- - -### 20 May 2021 | [Data-Efficient Reinforcement Learning with Self-Predictive Representations](https://arxiv.org/abs/2007.05929) | [⬇️](https://arxiv.org/pdf/2007.05929) -*Max Schwarzer, Ankesh Anand, Rishab Goel, R Devon Hjelm, Aaron Courville, Philip Bachman* - - While deep reinforcement learning excels at solving tasks where large amounts -of data can be collected through virtually unlimited interaction with the -environment, learning from limited interaction remains a key challenge. We -posit that an agent can learn more efficiently if we augment reward -maximization with self-supervised objectives based on structure in its visual -input and sequential interaction with the environment. Our method, -Self-Predictive Representations(SPR), trains an agent to predict its own latent -state representations multiple steps into the future. 
We compute target -representations for future states using an encoder which is an exponential -moving average of the agent's parameters and we make predictions using a -learned transition model. On its own, this future prediction objective -outperforms prior methods for sample-efficient deep RL from pixels. We further -improve performance by adding data augmentation to the future prediction loss, -which forces the agent's representations to be consistent across multiple views -of an observation. Our full self-supervised objective, which combines future -prediction and data augmentation, achieves a median human-normalized score of -0.415 on Atari in a setting limited to 100k steps of environment interaction, -which represents a 55% relative improvement over the previous state-of-the-art. -Notably, even in this limited data regime, SPR exceeds expert human scores on 7 -out of 26 games. The code associated with this work is available at -https://github.com/mila-iqia/spr - ---------------- - -### 06 Jul 2022 | [Learning Invariant World State Representations with Predictive Coding](https://arxiv.org/abs/2207.02972) | [⬇️](https://arxiv.org/pdf/2207.02972) -*Avi Ziskind, Sujeong Kim, and Giedrius T. Burachas* - - Self-supervised learning methods overcome the key bottleneck for building -more capable AI: limited availability of labeled data. However, one of the -drawbacks of self-supervised architectures is that the representations that -they learn are implicit and it is hard to extract meaningful information about -the encoded world states, such as 3D structure of the visual scene encoded in a -depth map. Moreover, in the visual domain such representations only rarely -undergo evaluations that may be critical for downstream tasks, such as vision -for autonomous cars. Herein, we propose a framework for evaluating visual -representations for illumination invariance in the context of depth perception. -We develop a new predictive coding-based architecture and a hybrid -fully-supervised/self-supervised learning method. We propose a novel -architecture that extends the predictive coding approach: PRedictive Lateral -bottom-Up and top-Down Encoder-decoder Network (PreludeNet), which explicitly -learns to infer and predict depth from video frames. In PreludeNet, the -encoder's stack of predictive coding layers is trained in a self-supervised -manner, while the predictive decoder is trained in a supervised manner to infer -or predict the depth. We evaluate the robustness of our model on a new -synthetic dataset, in which lighting conditions (such as overall illumination, -and effect of shadows) can be be parametrically adjusted while keeping all -other aspects of the world constant. PreludeNet achieves both competitive depth -inference performance and next frame prediction accuracy. We also show how this -new network architecture, coupled with the hybrid -fully-supervised/self-supervised learning method, achieves balance between the -said performance and invariance to changes in lighting. The proposed framework -for evaluating visual representations can be extended to diverse task domains -and invariance tests. 
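The SPR entry above computes target representations with an encoder that is an exponential moving average (EMA) of the agent's parameters. Below is a minimal PyTorch sketch of that EMA target-network update; the layer sizes and momentum value are arbitrary assumptions, not values from the paper.

```python
# Minimal PyTorch sketch of an exponential-moving-average (EMA) target encoder
# as used by SPR-style self-predictive objectives (illustrative only; network
# sizes and the momentum value are arbitrary).
import copy
import torch
import torch.nn as nn

def ema_update(online: nn.Module, target: nn.Module, tau: float = 0.99) -> None:
    # target <- tau * target + (1 - tau) * online, applied parameter-wise.
    with torch.no_grad():
        for p_t, p_o in zip(target.parameters(), online.parameters()):
            p_t.mul_(tau).add_(p_o, alpha=1.0 - tau)

if __name__ == "__main__":
    online = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
    target = copy.deepcopy(online)
    for p in target.parameters():
        p.requires_grad_(False)  # the target encoder is never trained directly
    # Pretend one optimization step changed the online encoder, then sync.
    with torch.no_grad():
        for p in online.parameters():
            p.add_(0.1 * torch.randn_like(p))
    ema_update(online, target)
    print("target parameters updated via EMA")
```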
- ---------------- - -### 10 Nov 2023 | [State2Explanation: Concept-Based Explanations to Benefit Agent Learning and User Understanding](https://arxiv.org/abs/2309.12482) | [⬇️](https://arxiv.org/pdf/2309.12482) -*Devleena Das, Sonia Chernova, Been Kim* - - As more non-AI experts use complex AI systems for daily tasks, there has been -an increasing effort to develop methods that produce explanations of AI -decision making that are understandable by non-AI experts. Towards this effort, -leveraging higher-level concepts and producing concept-based explanations have -become a popular method. Most concept-based explanations have been developed -for classification techniques, and we posit that the few existing methods for -sequential decision making are limited in scope. In this work, we first -contribute a desiderata for defining concepts in sequential decision making -settings. Additionally, inspired by the Protege Effect which states explaining -knowledge often reinforces one's self-learning, we explore how concept-based -explanations of an RL agent's decision making can in turn improve the agent's -learning rate, as well as improve end-user understanding of the agent's -decision making. To this end, we contribute a unified framework, -State2Explanation (S2E), that involves learning a joint embedding model between -state-action pairs and concept-based explanations, and leveraging such learned -model to both (1) inform reward shaping during an agent's training, and (2) -provide explanations to end-users at deployment for improved task performance. -Our experimental validations, in Connect 4 and Lunar Lander, demonstrate the -success of S2E in providing a dual-benefit, successfully informing reward -shaping and improving agent learning rate, as well as significantly improving -end user task performance at deployment time. - ---------------- - -### 17 May 2023 | [LeTI: Learning to Generate from Textual Interactions](https://arxiv.org/abs/2305.10314) | [⬇️](https://arxiv.org/pdf/2305.10314) -*Xingyao Wang, Hao Peng, Reyhaneh Jabbarvand, Heng Ji* - - Finetuning pre-trained language models (LMs) enhances the models' -capabilities. Prior techniques fine-tune a pre-trained LM on input-output pairs -(e.g., instruction fine-tuning), or with numerical rewards that gauge the -quality of its outputs (e.g., reinforcement learning from human feedback). We -explore LMs' potential to learn from textual interactions (LeTI) that not only -check their correctness with binary labels, but also pinpoint and explain -errors in their outputs through textual feedback. Our investigation focuses on -the code generation task, where the model produces code pieces in response to -natural language instructions. This setting invites a natural and scalable way -to acquire the textual feedback: the error messages and stack traces from code -execution using a Python interpreter. LeTI iteratively fine-tunes the model, -using the LM objective, on a concatenation of natural language instructions, -LM-generated programs, and textual feedback, which is only provided when the -generated program fails to solve the task. Prepended to this fine-tuning text, -a binary reward token is used to differentiate correct and buggy solutions. On -MBPP, a code generation dataset, LeTI substantially improves the performance of -two base LMs of different scales. LeTI requires no ground-truth outputs for -training and even outperforms a fine-tuned baseline that does. LeTI's strong -performance generalizes to other datasets. 
Trained on MBPP, it achieves -comparable or better performance than the base LMs on unseen problems in -HumanEval. Furthermore, compared to binary feedback, we observe that textual -feedback leads to improved generation quality and sample efficiency, achieving -the same performance with fewer than half of the gradient steps. LeTI is -equally applicable in natural language tasks when they can be formulated as -code generation, which we empirically verified on event argument extraction. - ---------------- - -### 01 Dec 2022 | [A General Purpose Supervisory Signal for Embodied Agents](https://arxiv.org/abs/2212.01186) | [⬇️](https://arxiv.org/pdf/2212.01186) -*Kunal Pratap Singh, Jordi Salvador, Luca Weihs, Aniruddha Kembhavi* - - Training effective embodied AI agents often involves manual reward -engineering, expert imitation, specialized components such as maps, or -leveraging additional sensors for depth and localization. Another approach is -to use neural architectures alongside self-supervised objectives which -encourage better representation learning. In practice, there are few guarantees -that these self-supervised objectives encode task-relevant information. We -propose the Scene Graph Contrastive (SGC) loss, which uses scene graphs as -general-purpose, training-only, supervisory signals. The SGC loss does away -with explicit graph decoding and instead uses contrastive learning to align an -agent's representation with a rich graphical encoding of its environment. The -SGC loss is generally applicable, simple to implement, and encourages -representations that encode objects' semantics, relationships, and history. -Using the SGC loss, we attain significant gains on three embodied tasks: Object -Navigation, Multi-Object Navigation, and Arm Point Navigation. Finally, we -present studies and analyses which demonstrate the ability of our trained -representation to encode semantic cues about the environment. - ---------------- - -### 16 May 2023 | [RAMario: Experimental Approach to Reptile Algorithm -- Reinforcement Learning for Mario](https://arxiv.org/abs/2305.09655) | [⬇️](https://arxiv.org/pdf/2305.09655) -*Sanyam Jain* - - This research paper presents an experimental approach to using the Reptile -algorithm for reinforcement learning to train a neural network to play Super -Mario Bros. We implement the Reptile algorithm using the Super Mario Bros Gym -library and TensorFlow in Python, creating a neural network model with a single -convolutional layer, a flatten layer, and a dense layer. We define the -optimizer and use the Reptile class to create an instance of the Reptile -meta-learning algorithm. We train the model using multiple tasks and episodes, -choosing actions using the current weights of the neural network model, taking -those actions in the environment, and updating the model weights using the -Reptile algorithm. We evaluate the performance of the algorithm by printing the -total reward for each episode. In addition, we compare the performance of the -Reptile algorithm approach to two other popular reinforcement learning -algorithms, Proximal Policy Optimization (PPO) and Deep Q-Network (DQN), -applied to the same Super Mario Bros task. Our results demonstrate that the -Reptile algorithm provides a promising approach to few-shot learning in video -game AI, with comparable or even better performance than the other two -algorithms, particularly in terms of moves vs distance that agent performs for -1M episodes of training. 
The results show that the best total distances for world 1-2 in the game environment were ~1732 (PPO), ~1840 (DQN), and ~2300 (RAMario). Full code is available at https://github.com/s4nyam/RAMario.

----------------

### 31 Mar 2023 | [Pair Programming with Large Language Models for Sampling and Estimation of Copulas](https://arxiv.org/abs/2303.18116) | [⬇️](https://arxiv.org/pdf/2303.18116)
*Jan Górecki*

Without a human writing a single line of code, an example Monte Carlo simulation-based application for stochastic dependence modeling with copulas is developed using a state-of-the-art large language model (LLM) fine-tuned for conversations. This includes interaction with ChatGPT in natural language and using mathematical formalism, which, under careful supervision by a human expert, led to producing working code in MATLAB, Python and R for sampling from a given copula model, evaluating the model's density, performing maximum likelihood estimation, optimizing the code for parallel computing on CPUs as well as GPUs, and visualizing the computed results. In contrast to other emerging studies that assess the accuracy of LLMs like ChatGPT on tasks from a selected area, this work instead investigates how to achieve a successful solution of a standard statistical task through collaboration between a human expert and artificial intelligence (AI). In particular, through careful prompt engineering, we separate successful solutions generated by ChatGPT from unsuccessful ones, resulting in a comprehensive list of related pros and cons. It is demonstrated that if the typical pitfalls are avoided, we can substantially benefit from collaborating with an AI partner. For example, we show that if ChatGPT is not able to provide a correct solution due to missing or incorrect knowledge, the human expert can feed it the correct knowledge, e.g., in the form of mathematical theorems and formulas, and make it apply the gained knowledge in order to provide a correct solution. Such ability presents an attractive opportunity to achieve a programmed solution even for users with rather limited knowledge of programming techniques.

----------------

### 28 Jun 2023 | [AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn](https://arxiv.org/abs/2306.08640) | [⬇️](https://arxiv.org/pdf/2306.08640)
*Difei Gao, Lei Ji, Luowei Zhou, Kevin Qinghong Lin, Joya Chen, Zihan Fan, Mike Zheng Shou*

Recent research on Large Language Models (LLMs) has led to remarkable advancements in general NLP AI assistants. Some studies have further explored the use of LLMs for planning and invoking models or APIs to address more general multi-modal user queries. Despite this progress, complex visual-based tasks still remain challenging due to the diverse nature of visual tasks. This diversity is reflected in two aspects: 1) Reasoning paths. For many real-life applications, it is hard to accurately decompose a query simply by examining the query itself; planning based on the specific visual content and the results of each step is usually required. 2) Flexible inputs and intermediate results. Input forms can be flexible for in-the-wild cases, and involve not only a single image or video but a mixture of videos and images, e.g., a user-view image with some reference videos. Besides, a complex reasoning process will also generate diverse multimodal intermediate results, e.g., video narrations, segmented video clips, etc.
To address such general cases, we propose a multi-modal AI assistant, AssistGPT, with an interleaved code and language reasoning approach called Plan, Execute, Inspect, and Learn (PEIL) to integrate LLMs with various tools. Specifically, the Planner uses natural language to plan which tool in the Executor should act next based on the current reasoning progress. The Inspector is an efficient memory manager that assists the Planner in feeding the proper visual information into a specific tool. Finally, since the entire reasoning process is complex and flexible, a Learner is designed to enable the model to autonomously explore and discover the optimal solution. We conducted experiments on the A-OKVQA and NExT-QA benchmarks, achieving state-of-the-art results. Moreover, showcases demonstrate the ability of our system to handle questions far more complex than those found in the benchmarks.

----------------

### 06 Nov 2019 | [Feedback-Based Self-Learning in Large-Scale Conversational AI Agents](https://arxiv.org/abs/1911.02557) | [⬇️](https://arxiv.org/pdf/1911.02557)
*Pragaash Ponnusamy, Alireza Roshan Ghias, Chenlei Guo, Ruhi Sarikaya*

Today, most large-scale conversational AI agents (e.g. Alexa, Siri, or Google Assistant) are built using manually annotated data to train the different components of the system. Typically, the accuracy of the ML models in these components is improved by manually transcribing and annotating data. As the scope of these systems increases to cover more scenarios and domains, manual annotation to improve the accuracy of these components becomes prohibitively costly and time consuming. In this paper, we propose a system that leverages user-system interaction feedback signals to automate learning without any manual annotation. Users here tend to modify a previous query in hopes of fixing an error in the previous turn to get the right results. These reformulations are often preceded by defective experiences caused by errors in ASR, NLU, ER, or the application. In some cases, users may not properly formulate their requests (e.g., providing only a partial title of a song), but gleaning across a wider pool of users and sessions reveals the underlying recurrent patterns. Our proposed self-learning system automatically detects errors, generates reformulations, and deploys fixes to the runtime system to correct different types of errors occurring in different components of the system. In particular, we propose leveraging an absorbing Markov Chain model as a collaborative filtering mechanism in a novel attempt to mine these patterns. We show that our approach is highly scalable and able to learn reformulations that reduce Alexa-user errors by pooling anonymized data across millions of customers. The proposed self-learning system achieves a win/loss ratio of 11.8 and effectively reduces the defect rate by more than 30% on utterance-level reformulations in our production A/B tests. To the best of our knowledge, this is the first self-learning large-scale conversational AI system in production.

----------------


# 2.


🚀 CoT allows AI models to break down complex problems into a series of intermediate steps, improving reasoning and performance. :thought_balloon:

🧠 Techniques like self-consistency and iterative refinement help AI models revise and refine their thought processes. :repeat:

⚡ CoT can be applied to various domains, including Telemed (e.g., differential diagnosis) and AI development (e.g., code generation). :computer:

### Keywords and Glossary

1. ⭐ Chain of Thought (CoT)
2. 🧩 Reasoning
3. 🔄 Iterative Refinement
4. 🧠 Self-Consistency
5. 💻 Code Generation
6. 🩺 Differential Diagnosis

```mermaid
graph TD
  A[Chain of Thought] --> B[Reasoning]
  A --> C[Iterative Refinement]
  A --> D[Self-Consistency]
  B --> E[Code Generation]
  B --> F[Differential Diagnosis]
```


```python
# app.py
import streamlit as st

st.title("Chain of Thought (CoT)")

# Headline techniques, rendered as markdown with emoji shortcodes.
st.header("State-of-the-Art Techniques")
st.write("🚀 CoT allows AI models to break down complex problems into a series of intermediate steps, improving reasoning and performance. :thought_balloon:")
st.write("🧠 Techniques like self-consistency and iterative refinement help AI models revise and refine their thought processes. :repeat:")
st.write("⚡ CoT can be applied to various domains, including Telemed (e.g., differential diagnosis) and AI development (e.g., code generation). :computer:")

st.header("Keywords and Glossary")
st.write("1. ⭐ Chain of Thought (CoT)")
st.write("2. 🧩 Reasoning")
st.write("3. 🔄 Iterative Refinement")
st.write("4. 🧠 Self-Consistency")
st.write("5. 💻 Code Generation")
st.write("6. 🩺 Differential Diagnosis")

# Streamlit has no built-in Mermaid renderer; show the diagram source instead
# (a third-party component such as streamlit-mermaid could render it).
st.code("""
graph TD
  A[Chain of Thought] --> B[Reasoning]
  A --> C[Iterative Refinement]
  A --> D[Self-Consistency]
  B --> E[Code Generation]
  B --> F[Differential Diagnosis]
""", language="mermaid")
```


# 🩺🔍 Search Results
### 07 Sep 2023 | [Structured Chain-of-Thought Prompting for Code Generation](https://arxiv.org/abs/2305.06599) | [⬇️](https://arxiv.org/pdf/2305.06599)
*Jia Li, Ge Li, Yongmin Li, Zhi Jin*

Large Language Models (LLMs) (e.g., ChatGPT) have shown impressive performance in code generation. LLMs take prompts as inputs, and Chain-of-Thought (CoT) prompting is the state-of-the-art prompting technique. CoT prompting asks LLMs first to generate CoTs (i.e., intermediate natural language reasoning steps) and then output the code. However, CoT prompting is designed for natural language generation and has low accuracy in code generation.

In this paper, we propose Structured CoTs (SCoTs) and present a novel prompting technique for code generation, named SCoT prompting. Our motivation is that source code contains rich structural information and any code can be composed of three program structures (i.e., sequence, branch, and loop structures). Intuitively, structured intermediate reasoning steps make for structured source code. Thus, we ask LLMs to use program structures to build CoTs, obtaining SCoTs. Then, LLMs generate the final code based on SCoTs. Compared to CoT prompting, SCoT prompting explicitly constrains LLMs to think about how to solve requirements from the view of source code and further improves the performance of LLMs in code generation. We apply SCoT prompting to two LLMs (i.e., ChatGPT and Codex) and evaluate it on three benchmarks (i.e., HumanEval, MBPP, and MBCPP). (1) SCoT prompting outperforms the state-of-the-art baseline, CoT prompting, by up to 13.79% in Pass@1. (2) Human evaluation shows human developers prefer programs from SCoT prompting. (3) SCoT prompting is robust to examples and achieves substantial improvements.
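
As a concrete companion to the CoT material above, here is a minimal, hypothetical sketch of chain-of-thought prompting combined with a self-consistency majority vote. The `query_llm` callable, the prompt wording, and the answer-extraction convention are illustrative assumptions, not the exact setup of any paper listed here.

```python
# Minimal sketch: chain-of-thought prompting with self-consistency voting.
# `query_llm` is a hypothetical stand-in for any LLM completion call.
from collections import Counter
from typing import Callable, List

COT_TEMPLATE = (
    "Question: {question}\n"
    "Let's think step by step, then give the final answer after 'Answer:'."
)

def extract_answer(completion: str) -> str:
    """Take the text after the last 'Answer:' marker as the final answer."""
    return completion.rsplit("Answer:", 1)[-1].strip()

def self_consistent_answer(
    question: str,
    query_llm: Callable[[str], str],  # returns one sampled completion
    n_samples: int = 5,
) -> str:
    """Sample several CoT completions and return the majority-vote answer."""
    prompt = COT_TEMPLATE.format(question=question)
    answers: List[str] = [extract_answer(query_llm(prompt)) for _ in range(n_samples)]
    return Counter(answers).most_common(1)[0][0]
```

Structured CoT, as described in the paper above, would roughly replace the free-form "think step by step" instruction with an intermediate outline built from program structures (sequence, branch, loop) before the code is emitted.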
- ---------------- - -### 15 Nov 2023 | [Eliminating Reasoning via Inferring with Planning: A New Framework to Guide LLMs' Non-linear Thinking](https://arxiv.org/abs/2310.12342) | [⬇️](https://arxiv.org/pdf/2310.12342) -*Yongqi Tong, Yifan Wang, Dawei Li, Sizhe Wang, Zi Lin, Simeng Han, Jingbo Shang* - - Chain-of-Thought(CoT) prompting and its variants explore equipping large -language models (LLMs) with high-level reasoning abilities by emulating -human-like linear cognition and logic. However, the human mind is complicated -and mixed with both linear and nonlinear thinking. In this work, we propose -\textbf{I}nferential \textbf{E}xclusion \textbf{P}rompting (IEP), a novel -prompting that combines the principles of elimination and inference in order to -guide LLMs to think non-linearly. IEP guides LLMs to plan and then utilize -Natural Language Inference (NLI) to deduce each possible solution's entailment -relation with context, commonsense, or facts, therefore yielding a broader -perspective by thinking back for inferring. This forward planning and backward -eliminating process allows IEP to better simulate the complex human thinking -processes compared to other CoT-based methods, which only reflect linear -cognitive processes. We conducted a series of empirical studies and have -corroborated that IEP consistently outperforms CoT across various tasks. -Additionally, we observe that integrating IEP and CoT further improves the -LLMs' performance on certain tasks, highlighting the necessity of equipping -LLMs with mixed logic processes. Moreover, to better evaluate comprehensive -features inherent in human logic, we introduce \textbf{M}ental-\textbf{A}bility -\textbf{R}easoning \textbf{B}enchmark (MARB). The benchmark comprises six novel -subtasks with a total of 9,115 questions, among which 1,685 are developed with -hand-crafted rationale references. We believe both \textsc{IEP} and -\textsc{MARB} can serve as a promising direction for unveiling LLMs' logic and -verbal reasoning abilities and drive further advancements. \textsc{MARB} will -be available at ~\texttt{anonymity link} soon. - ---------------- - -### 04 Jun 2023 | [Evaluating and Improving Tool-Augmented Computation-Intensive Math Reasoning](https://arxiv.org/abs/2306.02408) | [⬇️](https://arxiv.org/pdf/2306.02408) -*Beichen Zhang, Kun Zhou, Xilin Wei, Wayne Xin Zhao, Jing Sha, Shijin Wang, Ji-Rong Wen* - - Chain-of-thought prompting~(CoT) and tool augmentation have been validated in -recent work as effective practices for improving large language models~(LLMs) -to perform step-by-step reasoning on complex math-related tasks. However, most -existing math reasoning datasets may be not able to fully evaluate and analyze -the ability of LLMs in manipulating tools and performing reasoning, as they may -only require very few invocations of tools or miss annotations for evaluating -intermediate reasoning steps. To address the issue, we construct \textbf{CARP}, -a new Chinese dataset consisting of 4,886 computation-intensive algebra -problems with formulated annotations on intermediate steps. In CARP, we test -four LLMs with CoT prompting, and find that they are all prone to make mistakes -at the early steps of the solution, leading to wrong answers. Based on this -finding, we propose a new approach that can deliberate the reasoning steps with -tool interfaces, namely \textbf{DELI}. 
In DELI, we first initialize a step-by-step solution based on retrieved exemplars, then iterate two deliberation procedures that check and refine the intermediate steps of the generated solution, from the perspectives of tool manipulation and natural language reasoning, until the solution converges or the maximum number of turns is reached. Experimental results on CARP and six other datasets show that the proposed DELI mostly outperforms competitive baselines, and can further boost the performance of existing CoT methods. Our data and code are available at \url{https://github.com/RUCAIBox/CARP}.

----------------

### 23 Oct 2023 | [Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks](https://arxiv.org/abs/2211.12588) | [⬇️](https://arxiv.org/pdf/2211.12588)
*Wenhu Chen, Xueguang Ma, Xinyi Wang, William W. Cohen*

Recently, there has been significant progress in teaching language models to perform step-by-step reasoning to solve complex numerical reasoning tasks. Chain-of-thoughts prompting (CoT) is by far the state-of-the-art method for these tasks. CoT uses language models to perform both reasoning and computation in the multi-step `thought' process. To disentangle computation from reasoning, we propose `Program of Thoughts' (PoT), which uses language models (mainly Codex) to express the reasoning process as a program. The computation is relegated to an external computer, which executes the generated programs to derive the answer. We evaluate PoT on five math word problem datasets (GSM, AQuA, SVAMP, TabMWP, MultiArith) and three financial-QA datasets (FinQA, ConvFinQA, TATQA) for both few-shot and zero-shot setups. Under both few-shot and zero-shot settings, PoT shows an average performance gain over CoT of around 12% across all the evaluated datasets. By combining PoT with self-consistency decoding, we can achieve SoTA performance on all math problem datasets and near-SoTA performance on financial datasets. All of our data and code are released on GitHub: https://github.com/wenhuchen/Program-of-Thoughts

----------------

### 04 Jan 2024 | [Text2MDT: Extracting Medical Decision Trees from Medical Texts](https://arxiv.org/abs/2401.02034) | [⬇️](https://arxiv.org/pdf/2401.02034)
*Wei Zhu, Wenfeng Li, Xing Tian, Pengfei Wang, Xiaoling Wang, Jin Chen, Yuanbin Wu, Yuan Ni, Guotong Xie*

Knowledge of the medical decision process, which can be modeled as medical decision trees (MDTs), is critical for building clinical decision support systems. However, current MDT construction methods rely heavily on time-consuming and laborious manual annotation. In this work, we propose a novel task, Text2MDT, to explore the automatic extraction of MDTs from medical texts such as medical guidelines and textbooks. We normalize the form of the MDT and create an annotated Text-to-MDT dataset in Chinese with the participation of medical experts. We investigate two different methods for the Text2MDT task: (a) an end-to-end framework that relies only on instruction tuning of a GPT-style large language model (LLM) to generate all the node information and tree structures, and (b) a pipeline framework that decomposes the Text2MDT task into three subtasks. Experiments on our Text2MDT dataset demonstrate that: (a) the end-to-end method based on LLMs (7B parameters or larger) shows promising results, and successfully outperforms the pipeline methods;
(b) the chain-of-thought (CoT) prompting method \cite{Wei2022ChainOT} can improve the performance of the fine-tuned LLMs on the Text2MDT test set; (c) the lightweight pipelined method based on encoder-based pretrained models can perform comparably to LLMs while being two orders of magnitude smaller in model complexity. Our Text2MDT dataset is open-sourced at \url{https://tianchi.aliyun.com/dataset/95414}, and the source code is open-sourced at \url{https://github.com/michael-wzhu/text2dt}.

----------------

### 06 Jan 2024 | [Quartet Logic: A Four-Step Reasoning (QLFR) framework for advancing Short Text Classification](https://arxiv.org/abs/2401.03158) | [⬇️](https://arxiv.org/pdf/2401.03158)
*Hui Wu, Yuanben Zhang, Zhonghe Han, Yingyan Hou, Lei Wang, Siye Liu, Qihang Gong and Yunping Ge*

Short Text Classification (STC) is crucial for processing and comprehending the brief but substantial content prevalent on contemporary digital platforms. STC encounters difficulties in grasping semantic and syntactic intricacies, an issue that is apparent in traditional pre-trained language models. Although Graph Convolutional Networks enhance performance by integrating external knowledge bases, these methods are limited by the quality and extent of the knowledge applied. Recently, the emergence of Large Language Models (LLMs) and Chain-of-Thought (CoT) has significantly improved performance on complex reasoning tasks. However, some studies have highlighted the limitations of their application in fundamental NLP tasks. Consequently, this study sought to employ CoT to investigate the capabilities of LLMs in STC tasks. This study introduces Quartet Logic: A Four-Step Reasoning (QLFR) framework. This framework primarily incorporates Syntactic and Semantic Enrichment CoT, effectively decomposing the STC task into four distinct steps: (i) essential concept identification, (ii) common-sense knowledge retrieval, (iii) text rewriting, and (iv) classification. This elicits the inherent knowledge and abilities of LLMs to address the challenges in STC. Surprisingly, we found that QLFR can also improve the performance of smaller models. Therefore, we developed a CoT-Driven Multi-task learning (QLFR-CML) method to facilitate knowledge transfer from LLMs to smaller models. Extensive experimentation across six short-text benchmarks validated the efficacy of the proposed methods. Notably, QLFR achieved state-of-the-art performance on all datasets, with significant improvements, particularly on the Ohsumed and TagMyNews datasets.

----------------

### 23 Oct 2023 | [Reasoning with Language Model is Planning with World Model](https://arxiv.org/abs/2305.14992) | [⬇️](https://arxiv.org/pdf/2305.14992)
*Shibo Hao, Yi Gu, Haodi Ma, Joshua Jiahua Hong, Zhen Wang, Daisy Zhe Wang, Zhiting Hu*

Large language models (LLMs) have shown remarkable reasoning capabilities, especially when prompted to generate intermediate reasoning steps (e.g., Chain-of-Thought, CoT). However, LLMs can still struggle with problems that are easy for humans, such as generating action plans for executing tasks in a given environment, or performing complex math, logical, and commonsense reasoning. The deficiency stems from the key fact that LLMs lack an internal *world model* to predict the world *state* (e.g., environment status, intermediate variable values) and simulate long-term outcomes of actions.
This prevents LLMs from performing deliberate planning akin to human brains, which involves exploring alternative reasoning paths, anticipating future states and rewards, and iteratively refining existing reasoning steps. To overcome the limitations, we propose a new LLM reasoning framework, Reasoning via Planning (RAP). RAP repurposes the LLM as both a world model and a reasoning agent, and incorporates a principled planning algorithm (based on Monte Carlo Tree Search) for strategic exploration in the vast reasoning space. During reasoning, the LLM (as agent) incrementally builds a reasoning tree under the guidance of the LLM (as world model) and task-specific rewards, and obtains a high-reward reasoning path efficiently with a proper balance between exploration and exploitation. We apply RAP to a variety of challenging reasoning problems including plan generation, math reasoning, and logical inference. Empirical results on these tasks demonstrate the superiority of RAP over various strong baselines, including CoT and least-to-most prompting with self-consistency. RAP on LLaMA-33B surpasses CoT on GPT-4 with a 33% relative improvement in a plan generation setting.

----------------

### 11 Jan 2024 | [Evidence to Generate (E2G): A Single-agent Two-step Prompting for Context Grounded and Retrieval Augmented Reasoning](https://arxiv.org/abs/2401.05787) | [⬇️](https://arxiv.org/pdf/2401.05787)
*Md Rizwan Parvez*

While chain-of-thought (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g., Self-Consistency, ReAct, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR)) suffer from limitations like slowness, limited context grounding, hallucination and inconsistent outputs. To overcome these challenges, we introduce Evidence to Generate (E2G), a novel single-agent, two-step prompting framework. Instead of unverified reasoning claims, this innovative approach leverages the power of "evidence for decision making" by first focusing exclusively on the thought sequences (the series of intermediate steps) explicitly mentioned in the context, which then serve as extracted evidence, guiding the LLM's output generation process with greater precision and efficiency. This simple yet powerful approach unlocks the true potential of chain-of-thought-like prompting, paving the way for faster, more reliable, and more contextually aware reasoning in LLMs. E2G achieves remarkable results robustly across a wide range of knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For example, (i) on the LogiQA benchmark using GPT-4 as the backbone model, E2G achieves a new state-of-the-art accuracy of 53.8%, exceeding CoT by 18%, ToT by 11%, and CR by 9%; (ii) a variant of E2G with PaLM2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, reaching an F1 score of 83.3 on a subset of DROP.

----------------

### 06 Dec 2023 | [Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language Models with Creative Humor Generation](https://arxiv.org/abs/2312.02439) | [⬇️](https://arxiv.org/pdf/2312.02439)
*Shanshan Zhong, Zhongzhan Huang, Shanghua Gao, Wushao Wen, Liang Lin, Marinka Zitnik, Pan Zhou*

Chain-of-Thought (CoT) guides large language models (LLMs) to reason step-by-step, and can motivate their logical reasoning ability.
While effective -for logical tasks, CoT is not conducive to creative problem-solving which often -requires out-of-box thoughts and is crucial for innovation advancements. In -this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a -non-sequential, creative paradigm involving strong associations and knowledge -leaps. To this end, we study LLMs on the popular Oogiri game which needs -participants to have good creativity and strong associative thinking for -responding unexpectedly and humorously to the given image, text, or both, and -thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the -Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset -which contains over 130,000 samples from the Oogiri game, and observe the -insufficient LoT ability or failures of most existing LLMs on the Oogiri game. -Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve -LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into -LoT-oriented instruction tuning data to train pretrained LLM for achieving -certain LoT humor generation and discrimination abilities. Then CLoT designs an -explorative self-refinement that encourages the LLM to generate more creative -LoT data via exploring parallels between seemingly unrelated concepts and -selects high-quality data to train itself for self-refinement. CLoT not only -excels in humor generation in the Oogiri game but also boosts creative -abilities in various tasks like cloud guessing game and divergent association -task. These findings advance our understanding and offer a pathway to improve -LLMs' creative capacities for innovative applications across domains. The -dataset, code, and models will be released online. -https://zhongshsh.github.io/CLoT/. - ---------------- - -### 03 Oct 2023 | [Deductive Verification of Chain-of-Thought Reasoning](https://arxiv.org/abs/2306.03872) | [⬇️](https://arxiv.org/pdf/2306.03872) -*Zhan Ling, Yunhao Fang, Xuanlin Li, Zhiao Huang, Mingu Lee, Roland Memisevic and Hao Su* - - Large Language Models (LLMs) significantly benefit from Chain-of-Thought -(CoT) prompting in performing various reasoning tasks. While CoT allows models -to produce more comprehensive reasoning processes, its emphasis on intermediate -reasoning steps can inadvertently introduce hallucinations and accumulated -errors, thereby limiting models' ability to solve complex reasoning tasks. -Inspired by how humans engage in careful and meticulous deductive logical -reasoning processes to solve tasks, we seek to enable language models to -perform explicit and rigorous deductive reasoning, and also ensure the -trustworthiness of their reasoning process through self-verification. However, -directly verifying the validity of an entire deductive reasoning process is -challenging, even with advanced models like ChatGPT. In light of this, we -propose to decompose a reasoning verification process into a series of -step-by-step subprocesses, each only receiving their necessary context and -premises. To facilitate this procedure, we propose Natural Program, a natural -language-based deductive reasoning format. Our approach enables models to -generate precise reasoning steps where subsequent steps are more rigorously -grounded on prior steps. It also empowers language models to carry out -reasoning self-verification in a step-by-step manner. 
By integrating this -verification process into each deductive reasoning stage, we significantly -enhance the rigor and trustfulness of generated reasoning steps. Along this -process, we also improve the answer correctness on complex reasoning tasks. -Code will be released at https://github.com/lz1oceani/verify_cot. - ---------------- - -### 25 May 2023 | [Code as Policies: Language Model Programs for Embodied Control](https://arxiv.org/abs/2209.07753) | [⬇️](https://arxiv.org/pdf/2209.07753) -*Jacky Liang, Wenlong Huang, Fei Xia, Peng Xu, Karol Hausman, Brian Ichter, Pete Florence, Andy Zeng* - - Large language models (LLMs) trained on code completion have been shown to be -capable of synthesizing simple Python programs from docstrings [1]. We find -that these code-writing LLMs can be re-purposed to write robot policy code, -given natural language commands. Specifically, policy code can express -functions or feedback loops that process perception outputs (e.g.,from object -detectors [2], [3]) and parameterize control primitive APIs. When provided as -input several example language commands (formatted as comments) followed by -corresponding policy code (via few-shot prompting), LLMs can take in new -commands and autonomously re-compose API calls to generate new policy code -respectively. By chaining classic logic structures and referencing third-party -libraries (e.g., NumPy, Shapely) to perform arithmetic, LLMs used in this way -can write robot policies that (i) exhibit spatial-geometric reasoning, (ii) -generalize to new instructions, and (iii) prescribe precise values (e.g., -velocities) to ambiguous descriptions ("faster") depending on context (i.e., -behavioral commonsense). This paper presents code as policies: a robot-centric -formulation of language model generated programs (LMPs) that can represent -reactive policies (e.g., impedance controllers), as well as waypoint-based -policies (vision-based pick and place, trajectory-based control), demonstrated -across multiple real robot platforms. Central to our approach is prompting -hierarchical code-gen (recursively defining undefined functions), which can -write more complex code and also improves state-of-the-art to solve 39.8% of -problems on the HumanEval [1] benchmark. Code and videos are available at -https://code-as-policies.github.io - ---------------- - -### 02 Feb 2024 | [Neuron Patching: Neuron-level Model Editing on Code Generation and LLMs](https://arxiv.org/abs/2312.05356) | [⬇️](https://arxiv.org/pdf/2312.05356) -*Jian Gu, Chunyang Chen, Aldeida Aleti* - - Large Language Models are successfully adopted in software engineering, -especially in code generation. Updating these models with new knowledge is very -expensive, and is often required to fully realize their value. In this paper, -we propose a novel and effective model editing approach, \textsc{MENT}, to -patch LLMs in coding tasks. Based on the mechanism of generative LLMs, -\textsc{MENT} enables model editing in next-token predictions, and further -supports common coding tasks. \textsc{MENT} is effective, efficient, and -reliable. It can correct a neural model by patching 1 or 2 neurons. As the -pioneer work on neuron-level model editing of generative models, we formalize -the editing process and introduce the involved concepts. Besides, we also -introduce new measures to evaluate its generalization ability, and build a -benchmark for further study. 
Our approach is evaluated on three coding tasks, -including API-seq recommendation, line-level code generation, and -pseudocode-to-code transaction. It outperforms the state-of-the-art by a -significant margin on both effectiveness and efficiency measures. In addition, -we demonstrate the usages of \textsc{MENT} for LLM reasoning in software -engineering. By editing the LLM knowledge with \textsc{MENT}, the directly or -indirectly dependent behaviors in the chain-of-thought change accordingly and -automatically. - ---------------- - -### 02 Mar 2024 | [DQ-LoRe: Dual Queries with Low Rank Approximation Re-ranking for In-Context Learning](https://arxiv.org/abs/2310.02954) | [⬇️](https://arxiv.org/pdf/2310.02954) -*Jing Xiong, Zixuan Li, Chuanyang Zheng, Zhijiang Guo, Yichun Yin, Enze Xie, Zhicheng Yang, Qingxing Cao, Haiming Wang, Xiongwei Han, Jing Tang, Chengming Li, Xiaodan Liang* - - Recent advances in natural language processing, primarily propelled by Large -Language Models (LLMs), have showcased their remarkable capabilities grounded -in in-context learning. A promising avenue for guiding LLMs in intricate -reasoning tasks involves the utilization of intermediate reasoning steps within -the Chain-of-Thought (CoT) paradigm. Nevertheless, the central challenge lies -in the effective selection of exemplars for facilitating in-context learning. -In this study, we introduce a framework that leverages Dual Queries and -Low-rank approximation Re-ranking (DQ-LoRe) to automatically select exemplars -for in-context learning. Dual Queries first query LLM to obtain LLM-generated -knowledge such as CoT, then query the retriever to obtain the final exemplars -via both question and the knowledge. Moreover, for the second query, LoRe -employs dimensionality reduction techniques to refine exemplar selection, -ensuring close alignment with the input question's knowledge. Through extensive -experiments, we demonstrate that DQ-LoRe significantly outperforms prior -state-of-the-art methods in the automatic selection of exemplars for GPT-4, -enhancing performance from 92.5% to 94.2%. Our comprehensive analysis further -reveals that DQ-LoRe consistently outperforms retrieval-based approaches in -terms of both performance and adaptability, especially in scenarios -characterized by distribution shifts. DQ-LoRe pushes the boundary of in-context -learning and opens up new avenues for addressing complex reasoning challenges. -Our code is released at -https://github.com/AI4fun/DQ-LoRe}{https://github.com/AI4fun/DQ-LoRe. - ---------------- - -### 24 Feb 2024 | [Stepwise Self-Consistent Mathematical Reasoning with Large Language Models](https://arxiv.org/abs/2402.17786) | [⬇️](https://arxiv.org/pdf/2402.17786) -*Zilong Zhao, Yao Rong, Dongyang Guo, Emek G\"ozl\"ukl\"u, Emir G\"ulboy, Enkelejda Kasneci* - - Using Large Language Models for complex mathematical reasoning is difficult, -primarily due to the complexity of multi-step reasoning. The main challenges of -this process include (1) selecting critical intermediate results to advance the -procedure, and (2) limited exploration of potential solutions. To address these -issues, we introduce a novel algorithm, namely Stepwise Self-Consistent -Chain-of-Thought (SSC-CoT). SSC-CoT employs a strategy of selecting -intermediate steps based on the intersection of various reasoning chains. -Additionally, SSC-CoT enables the model to discover critical intermediate steps -by querying a knowledge graph comprising relevant domain knowledge. 
To validate -SSC-CoT, we present a new dataset, TriMaster100, tailored for complex -trigonometry problems. This dataset contains 100 questions, with each solution -broken down into scored intermediate steps, facilitating a comprehensive -evaluation of the mathematical reasoning process. On TriMaster100, SSC-CoT -triples the effectiveness of the state-of-the-art methods. Furthermore, we -benchmark SSC-CoT on the widely recognized complex mathematical question -dataset, MATH level 5, and it surpasses the second-best method by 7.2% in -accuracy. Code and the TriMaster100 dataset can be found at: -https://github.com/zhao-zilong/ssc-cot. - ---------------- - -### 16 Mar 2023 | [ART: Automatic multi-step reasoning and tool-use for large language models](https://arxiv.org/abs/2303.09014) | [⬇️](https://arxiv.org/pdf/2303.09014) -*Bhargavi Paranjape, Scott Lundberg, Sameer Singh, Hannaneh Hajishirzi, Luke Zettlemoyer, Marco Tulio Ribeiro* - - Large language models (LLMs) can perform complex reasoning in few- and -zero-shot settings by generating intermediate chain of thought (CoT) reasoning -steps. Further, each reasoning step can rely on external tools to support -computation beyond the core LLM capabilities (e.g. search/running code). Prior -work on CoT prompting and tool use typically requires hand-crafting -task-specific demonstrations and carefully scripted interleaving of model -generations with tool use. We introduce Automatic Reasoning and Tool-use (ART), -a framework that uses frozen LLMs to automatically generate intermediate -reasoning steps as a program. Given a new task to solve, ART selects -demonstrations of multi-step reasoning and tool use from a task library. At -test time, ART seamlessly pauses generation whenever external tools are called, -and integrates their output before resuming generation. ART achieves a -substantial improvement over few-shot prompting and automatic CoT on unseen -tasks in the BigBench and MMLU benchmarks, and matches performance of -hand-crafted CoT prompts on a majority of these tasks. ART is also extensible, -and makes it easy for humans to improve performance by correcting errors in -task-specific programs or incorporating new tools, which we demonstrate by -drastically improving performance on select tasks with minimal human -intervention. - ---------------- - -### 26 May 2023 | [Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models](https://arxiv.org/abs/2305.04091) | [⬇️](https://arxiv.org/pdf/2305.04091) -*Lei Wang, Wanyu Xu, Yihuai Lan, Zhiqiang Hu, Yunshi Lan, Roy Ka-Wei Lee and Ee-Peng Lim* - - Large language models (LLMs) have recently been shown to deliver impressive -performance in various NLP tasks. To tackle multi-step reasoning tasks, -few-shot chain-of-thought (CoT) prompting includes a few manually crafted -step-by-step reasoning demonstrations which enable LLMs to explicitly generate -reasoning steps and improve their reasoning task accuracy. To eliminate the -manual effort, Zero-shot-CoT concatenates the target problem statement with -"Let's think step by step" as an input prompt to LLMs. Despite the success of -Zero-shot-CoT, it still suffers from three pitfalls: calculation errors, -missing-step errors, and semantic misunderstanding errors. To address the -missing-step errors, we propose Plan-and-Solve (PS) Prompting. It consists of -two components: first, devising a plan to divide the entire task into smaller -subtasks, and then carrying out the subtasks according to the plan. 
To address -the calculation errors and improve the quality of generated reasoning steps, we -extend PS prompting with more detailed instructions and derive PS+ prompting. -We evaluate our proposed prompting strategy on ten datasets across three -reasoning problems. The experimental results over GPT-3 show that our proposed -zero-shot prompting consistently outperforms Zero-shot-CoT across all datasets -by a large margin, is comparable to or exceeds Zero-shot-Program-of-Thought -Prompting, and has comparable performance with 8-shot CoT prompting on the math -reasoning problem. The code can be found at -https://github.com/AGI-Edgerunners/Plan-and-Solve-Prompting. - ---------------- - -### 02 Dec 2021 | [CX-ToM: Counterfactual Explanations with Theory-of-Mind for Enhancing Human Trust in Image Recognition Models](https://arxiv.org/abs/2109.01401) | [⬇️](https://arxiv.org/pdf/2109.01401) -*Arjun R. Akula, Keze Wang, Changsong Liu, Sari Saba-Sadiya, Hongjing Lu, Sinisa Todorovic, Joyce Chai, and Song-Chun Zhu* - - We propose CX-ToM, short for counterfactual explanations with theory-of mind, -a new explainable AI (XAI) framework for explaining decisions made by a deep -convolutional neural network (CNN). In contrast to the current methods in XAI -that generate explanations as a single shot response, we pose explanation as an -iterative communication process, i.e. dialog, between the machine and human -user. More concretely, our CX-ToM framework generates sequence of explanations -in a dialog by mediating the differences between the minds of machine and human -user. To do this, we use Theory of Mind (ToM) which helps us in explicitly -modeling human's intention, machine's mind as inferred by the human as well as -human's mind as inferred by the machine. Moreover, most state-of-the-art XAI -frameworks provide attention (or heat map) based explanations. In our work, we -show that these attention based explanations are not sufficient for increasing -human trust in the underlying CNN model. In CX-ToM, we instead use -counterfactual explanations called fault-lines which we define as follows: -given an input image I for which a CNN classification model M predicts class -c_pred, a fault-line identifies the minimal semantic-level features (e.g., -stripes on zebra, pointed ears of dog), referred to as explainable concepts, -that need to be added to or deleted from I in order to alter the classification -category of I by M to another specified class c_alt. We argue that, due to the -iterative, conceptual and counterfactual nature of CX-ToM explanations, our -framework is practical and more natural for both expert and non-expert users to -understand the internal workings of complex deep learning models. Extensive -quantitative and qualitative experiments verify our hypotheses, demonstrating -that our CX-ToM significantly outperforms the state-of-the-art explainable AI -models. - ---------------- - -### 18 Oct 2023 | [Concept-Guided Chain-of-Thought Prompting for Pairwise Comparison Scaling of Texts with Large Language Models](https://arxiv.org/abs/2310.12049) | [⬇️](https://arxiv.org/pdf/2310.12049) -*Patrick Y. Wu, Jonathan Nagler, Joshua A. Tucker, Solomon Messing* - - Existing text scaling methods often require a large corpus, struggle with -short texts, or require labeled data. We develop a text scaling method that -leverages the pattern recognition capabilities of generative large language -models (LLMs). 
Specifically, we propose concept-guided chain-of-thought -(CGCoT), which uses prompts designed to summarize ideas and identify target -parties in texts to generate concept-specific breakdowns, in many ways similar -to guidance for human coder content analysis. CGCoT effectively shifts pairwise -text comparisons from a reasoning problem to a pattern recognition problem. We -then pairwise compare concept-specific breakdowns using an LLM. We use the -results of these pairwise comparisons to estimate a scale using the -Bradley-Terry model. We use this approach to scale affective speech on Twitter. -Our measures correlate more strongly with human judgments than alternative -approaches like Wordfish. Besides a small set of pilot data to develop the -CGCoT prompts, our measures require no additional labeled data and produce -binary predictions comparable to a RoBERTa-Large model fine-tuned on thousands -of human-labeled tweets. We demonstrate how combining substantive knowledge -with LLMs can create state-of-the-art measures of abstract concepts. - ---------------- - -### 21 Nov 2019 | [Scalable Attentive Sentence-Pair Modeling via Distilled Sentence Embedding](https://arxiv.org/abs/1908.05161) | [⬇️](https://arxiv.org/pdf/1908.05161) -*Oren Barkan, Noam Razin, Itzik Malkiel, Ori Katz, Avi Caciularu, Noam Koenigstein* - - Recent state-of-the-art natural language understanding models, such as BERT -and XLNet, score a pair of sentences (A and B) using multiple cross-attention -operations - a process in which each word in sentence A attends to all words in -sentence B and vice versa. As a result, computing the similarity between a -query sentence and a set of candidate sentences, requires the propagation of -all query-candidate sentence-pairs throughout a stack of cross-attention -layers. This exhaustive process becomes computationally prohibitive when the -number of candidate sentences is large. In contrast, sentence embedding -techniques learn a sentence-to-vector mapping and compute the similarity -between the sentence vectors via simple elementary operations. In this paper, -we introduce Distilled Sentence Embedding (DSE) - a model that is based on -knowledge distillation from cross-attentive models, focusing on sentence-pair -tasks. The outline of DSE is as follows: Given a cross-attentive teacher model -(e.g. a fine-tuned BERT), we train a sentence embedding based student model to -reconstruct the sentence-pair scores obtained by the teacher model. We -empirically demonstrate the effectiveness of DSE on five GLUE sentence-pair -tasks. DSE significantly outperforms several ELMO variants and other sentence -embedding methods, while accelerating computation of the query-candidate -sentence-pairs similarities by several orders of magnitude, with an average -relative degradation of 4.6% compared to BERT. Furthermore, we show that DSE -produces sentence embeddings that reach state-of-the-art performance on -universal sentence representation benchmarks. Our code is made publicly -available at https://github.com/microsoft/Distilled-Sentence-Embedding. - ---------------- - -### 02 Dec 2023 | [Cumulative Reasoning with Large Language Models](https://arxiv.org/abs/2308.04371) | [⬇️](https://arxiv.org/pdf/2308.04371) -*Yifan Zhang, Jingqin Yang, Yang Yuan, Andrew Chi-Chih Yao* - - While language models are powerful and versatile, they often fail to address -highly complex problems. This is because solving complex problems requires -deliberate thinking, which has been only minimally guided during training. 
In -this paper, we propose a new method called Cumulative Reasoning (CR), which -employs language models in a cumulative and iterative manner to emulate human -thought processes. By decomposing tasks into smaller components, CR streamlines -the problem-solving process, rendering it both more manageable and effective. -For logical inference tasks, CR consistently outperforms existing methods with -an improvement up to 9.3%, and achieves an accuracy of 98.04% on the curated -FOLIO wiki dataset. In the context of the Game of 24, CR achieves an accuracy -of 98%, which signifies a substantial enhancement of 24% over the previous -state-of-the-art method. Finally, on the MATH dataset, we establish new -state-of-the-art results with 58.0% overall accuracy, surpassing the previous -best approach by a margin of 4.2%, and achieving 43% relative improvement on -the hardest level 5 problems (22.4% to 32.1%). Additionally, we expand the -concept of Cumulative Reasoning to incorporate a Python code environment, -deliberately omitting external aids such as retrieval and web browsing and -focusing solely on the LLM's intrinsic reasoning capabilities within a Python -code environment. Our experiments in this setting yielded impressive results, -with an overall accuracy of 72.2% on the MATH dataset, significantly -outperforming the PAL method with 38.8% relative improvement. Code is available -at https://github.com/iiis-ai/cumulative-reasoning. - ---------------- - - - - -# 3. - - -1. 🧠 Transformer-based Language Models (e.g., GPT-3, BERT) for capturing semantic knowledge -2. 📚 Memory-augmented Neural Networks (e.g., Neural Turing Machines, Differentiable Neural Computers) for episodic memory -3. 🤖 Multimodal Learning (e.g., Vision-and-Language Models, Embodied AI) for grounding memory in different modalities - -Keyword Glossary Outline with Emojis: - -- 💻 Transformer -- 🧠 Neural Network -- 💾 Memory Augmentation -- 🔢 Sequence Modeling -- 👁️ Multimodal Learning -- 🌐 Knowledge Representation - -```mermaid -graph LR - Transformer --> NeuralNetwork - Transformer --> SequenceModeling - NeuralNetwork --> MemoryAugmentation - MemoryAugmentation --> KnowledgeRepresentation - MultimodalLearning --> KnowledgeRepresentation - SequenceModeling --> MultimodalLearning -``` - -```python -# app.py -import streamlit as st - -st.title("Memory in AI: Semantic and Episodic Capabilities") - -st.header("State-of-the-Art Techniques") -st.markdown(""" -1. 🧠 Transformer-based Language Models (e.g., GPT-3, BERT) for capturing semantic knowledge -2. 📚 Memory-augmented Neural Networks (e.g., Neural Turing Machines, Differentiable Neural Computers) for episodic memory -3. 
🤖 Multimodal Learning (e.g., Vision-and-Language Models, Embodied AI) for grounding memory in different modalities
""")

st.header("Keyword Glossary")
st.markdown("""
- 💻 Transformer
- 🧠 Neural Network
- 💾 Memory Augmentation
- 🔢 Sequence Modeling
- 👁️ Multimodal Learning
- 🌐 Knowledge Representation
""")

st.header("Mermaid Model")
# Streamlit has no built-in Mermaid renderer; show the diagram source instead
# (a third-party component such as streamlit-mermaid could render it).
st.code("""
graph LR
  Transformer --> NeuralNetwork
  Transformer --> SequenceModeling
  NeuralNetwork --> MemoryAugmentation
  MemoryAugmentation --> KnowledgeRepresentation
  MultimodalLearning --> KnowledgeRepresentation
  SequenceModeling --> MultimodalLearning
""", language="mermaid")

st.header("Examples")
st.write("Telemed: Memory-augmented models could assist in capturing patient history and medical knowledge for better diagnosis and treatment.")
st.write("AI for Doctors and Developers: Multimodal learning techniques could help represent and reason about medical data from various sources (text, images, signals) for improved decision-making and collaboration.")

# Run with `streamlit run app.py`; Streamlit executes the script top to bottom,
# so no explicit run() call is needed.
```




# 🩺🔍 Search Results
### 14 Sep 2015 | [Deep Learning Applied to Image and Text Matching](https://arxiv.org/abs/1601.03478) | [⬇️](https://arxiv.org/pdf/1601.03478)
*Afroze Ibrahim Baqapuri*

The ability to describe images with natural language sentences is the hallmark of image and language understanding. Such a system has wide-ranging applications such as annotating images and using natural sentences to search for images. In this project we focus on the task of bidirectional image retrieval: such a system is capable of retrieving an image based on a sentence (image search) and retrieving a sentence based on an image query (image annotation). We present a system based on a global ranking objective function which uses a combination of convolutional neural networks (CNN) and multi-layer perceptrons (MLP). It takes a pair of image and sentence and processes them in different channels, finally embedding them into a common multimodal vector space. These embeddings encode abstract semantic information about the two inputs and can be compared using traditional information retrieval approaches. For each such pair, the model returns a score which is interpreted as a similarity metric. If this score is high, the image and sentence are likely to convey similar meaning, and if the score is low then they are likely not to.

The visual input is modeled via a deep convolutional neural network. For the textual module we explore three models. The first one is bag of words with an MLP. The second one uses n-grams (bigrams, trigrams, and a combination of trigrams & skip-grams) with an MLP. The third is a more specialized deep network specific for modeling variable-length sequences (SSE). We report comparable performance to recent work in the field, even though our overall model is simpler. We also show that the training-time choice of how we generate our negative samples has a significant impact on performance, and can be used to specialize the bi-directional system in one particular task.
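
Both the joint-embedding retrieval described in the preceding abstract and the episodic-memory theme of this section boil down to nearest-neighbor search in a shared vector space. The snippet below is a minimal NumPy sketch of that step; the random vectors merely stand in for embeddings produced by a trained encoder.

```python
# Minimal sketch: cosine-similarity retrieval over a small embedding "memory".
# Random vectors stand in for outputs of a trained image or text encoder.
import numpy as np

def cosine_scores(query: np.ndarray, memory: np.ndarray) -> np.ndarray:
    """Return the cosine similarity between one query vector and each memory row."""
    q = query / np.linalg.norm(query)
    m = memory / np.linalg.norm(memory, axis=1, keepdims=True)
    return m @ q

rng = np.random.default_rng(0)
memory = rng.normal(size=(5, 128))   # e.g., 5 stored image/caption embeddings
query = rng.normal(size=128)         # e.g., an embedded query sentence

scores = cosine_scores(query, memory)
print("best match index:", int(np.argmax(scores)))
```

In a real system the "memory" rows would be the jointly embedded images (or past episodes), and the highest-scoring row is returned as the retrieval result.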
- ---------------- - -### 12 May 2023 | [IMAGINATOR: Pre-Trained Image+Text Joint Embeddings using Word-Level Grounding of Images](https://arxiv.org/abs/2305.10438) | [⬇️](https://arxiv.org/pdf/2305.10438) -*Varuna Krishna, S Suryavardan, Shreyash Mishra, Sathyanarayanan Ramamoorthy, Parth Patwa, Megha Chakraborty, Aman Chadha, Amitava Das, Amit Sheth* - - Word embeddings, i.e., semantically meaningful vector representation of -words, are largely influenced by the distributional hypothesis "You shall know -a word by the company it keeps" (Harris, 1954), whereas modern prediction-based -neural network embeddings rely on design choices and hyperparameter -optimization. Word embeddings like Word2Vec, GloVe etc. well capture the -contextuality and real-world analogies but contemporary convolution-based image -embeddings such as VGGNet, AlexNet, etc. do not capture contextual knowledge. -The popular king-queen analogy does not hold true for most commonly used vision -embeddings. - In this paper, we introduce a pre-trained joint embedding (JE), named -IMAGINATOR, trained on 21K distinct image objects level from 1M image+text -pairs. JE is a way to encode multimodal data into a vector space where the text -modality serves as the ground-ing key, which the complementary modality (in -this case, the image) is anchored with. IMAGINATOR encapsulates three -individual representations: (i) object-object co-location, (ii) word-object -co-location, and (iii) word-object correlation. These three ways capture -complementary aspects of the two modalities which are further combined to -obtain the final JEs. - Generated JEs are intrinsically evaluated to assess how well they capture the -contextuality and real-world analogies. We also evaluate pre-trained IMAGINATOR -JEs on three downstream tasks: (i) image captioning, (ii) Image2Tweet, and -(iii) text-based image retrieval. IMAGINATOR establishes a new standard on the -aforementioned down-stream tasks by outperforming the current SoTA on all the -selected tasks. IMAGINATOR will be made publicly available. The codes are -available at https://github.com/varunakk/IMAGINATOR - ---------------- - -### 06 Mar 2020 | [Distributional semantic modeling: a revised technique to train term/word vector space models applying the ontology-related approach](https://arxiv.org/abs/2003.03350) | [⬇️](https://arxiv.org/pdf/2003.03350) -*Oleksandr Palagin, Vitalii Velychko, Kyrylo Malakhov and Oleksandr Shchurov* - - We design a new technique for the distributional semantic modeling with a -neural network-based approach to learn distributed term representations (or -term embeddings) - term vector space models as a result, inspired by the recent -ontology-related approach (using different types of contextual knowledge such -as syntactic knowledge, terminological knowledge, semantic knowledge, etc.) to -the identification of terms (term extraction) and relations between them -(relation extraction) called semantic pre-processing technology - SPT. Our -method relies on automatic term extraction from the natural language texts and -subsequent formation of the problem-oriented or application-oriented (also -deeply annotated) text corpora where the fundamental entity is the term -(includes non-compositional and compositional terms). This gives us an -opportunity to changeover from distributed word representations (or word -embeddings) to distributed term representations (or term embeddings). 
This -transition will allow to generate more accurate semantic maps of different -subject domains (also, of relations between input terms - it is useful to -explore clusters and oppositions, or to test your hypotheses about them). The -semantic map can be represented as a graph using Vec2graph - a Python library -for visualizing word embeddings (term embeddings in our case) as dynamic and -interactive graphs. The Vec2graph library coupled with term embeddings will not -only improve accuracy in solving standard NLP tasks, but also update the -conventional concept of automated ontology development. The main practical -result of our work is the development kit (set of toolkits represented as web -service APIs and web application), which provides all necessary routines for -the basic linguistic pre-processing and the semantic pre-processing of the -natural language texts in Ukrainian for future training of term vector space -models. - ---------------- - -### 30 Aug 2018 | [Beyond Word Embeddings: Learning Entity and Concept Representations from Large Scale Knowledge Bases](https://arxiv.org/abs/1801.00388) | [⬇️](https://arxiv.org/pdf/1801.00388) -*Walid Shalaby, Wlodek Zadrozny, and Hongxia Jin* - - Text representations using neural word embeddings have proven effective in -many NLP applications. Recent researches adapt the traditional word embedding -models to learn vectors of multiword expressions (concepts/entities). However, -these methods are limited to textual knowledge bases (e.g., Wikipedia). In this -paper, we propose a novel and simple technique for integrating the knowledge -about concepts from two large scale knowledge bases of different structure -(Wikipedia and Probase) in order to learn concept representations. We adapt the -efficient skip-gram model to seamlessly learn from the knowledge in Wikipedia -text and Probase concept graph. We evaluate our concept embedding models on two -tasks: (1) analogical reasoning, where we achieve a state-of-the-art -performance of 91% on semantic analogies, (2) concept categorization, where we -achieve a state-of-the-art performance on two benchmark datasets achieving -categorization accuracy of 100% on one and 98% on the other. Additionally, we -present a case study to evaluate our model on unsupervised argument type -identification for neural semantic parsing. We demonstrate the competitive -accuracy of our unsupervised method and its ability to better generalize to out -of vocabulary entity mentions compared to the tedious and error prone methods -which depend on gazetteers and regular expressions. - ---------------- - -### 15 Jun 2021 | [Semantic Representation and Inference for NLP](https://arxiv.org/abs/2106.08117) | [⬇️](https://arxiv.org/pdf/2106.08117) -*Dongsheng Wang* - - Semantic representation and inference is essential for Natural Language -Processing (NLP). The state of the art for semantic representation and -inference is deep learning, and particularly Recurrent Neural Networks (RNNs), -Convolutional Neural Networks (CNNs), and transformer Self-Attention models. -This thesis investigates the use of deep learning for novel semantic -representation and inference, and makes contributions in the following three -areas: creating training data, improving semantic representations and extending -inference learning. 
In terms of creating training data, we contribute the -largest publicly available dataset of real-life factual claims for the purpose -of automatic claim verification (MultiFC), and we present a novel inference -model composed of multi-scale CNNs with different kernel sizes that learn from -external sources to infer fact checking labels. In terms of improving semantic -representations, we contribute a novel model that captures non-compositional -semantic indicators. By definition, the meaning of a non-compositional phrase -cannot be inferred from the individual meanings of its composing words (e.g., -hot dog). Motivated by this, we operationalize the compositionality of a phrase -contextually by enriching the phrase representation with external word -embeddings and knowledge graphs. Finally, in terms of inference learning, we -propose a series of novel deep learning architectures that improve inference by -using syntactic dependencies, by ensembling role guided attention heads, -incorporating gating layers, and concatenating multiple heads in novel and -effective ways. This thesis consists of seven publications (five published and -two under review). - ---------------- - -### 13 Nov 2021 | [Explainable Semantic Space by Grounding Language to Vision with Cross-Modal Contrastive Learning](https://arxiv.org/abs/2111.07180) | [⬇️](https://arxiv.org/pdf/2111.07180) -*Yizhen Zhang, Minkyu Choi, Kuan Han, Zhongming Liu* - - In natural language processing, most models try to learn semantic -representations merely from texts. The learned representations encode the -distributional semantics but fail to connect to any knowledge about the -physical world. In contrast, humans learn language by grounding concepts in -perception and action and the brain encodes grounded semantics for cognition. -Inspired by this notion and recent work in vision-language learning, we design -a two-stream model for grounding language learning in vision. The model -includes a VGG-based visual stream and a Bert-based language stream. The two -streams merge into a joint representational space. Through cross-modal -contrastive learning, the model first learns to align visual and language -representations with the MS COCO dataset. The model further learns to retrieve -visual objects with language queries through a cross-modal attention module and -to infer the visual relations between the retrieved objects through a bilinear -operator with the Visual Genome dataset. After training, the language stream of -this model is a stand-alone language model capable of embedding concepts in a -visually grounded semantic space. This semantic space manifests principal -dimensions explainable with human intuition and neurobiological knowledge. Word -embeddings in this semantic space are predictive of human-defined norms of -semantic features and are segregated into perceptually distinctive clusters. -Furthermore, the visually grounded language model also enables compositional -language understanding based on visual knowledge and multimodal image search -with queries based on images, texts, or their combinations. 
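
The cross-modal objective above pairs each image with its caption and pushes mismatched pairs apart in the joint space. Below is a minimal PyTorch sketch of that idea as a symmetric InfoNCE-style loss; the random tensors stand in for the visual and language stream outputs, and the embedding size and temperature are illustrative assumptions, not the paper's actual settings.

```python
import torch
import torch.nn.functional as F

def cross_modal_contrastive_loss(img_emb, txt_emb, temperature=0.07):
    """Symmetric InfoNCE-style loss: matched image/text pairs sit on the diagonal."""
    img = F.normalize(img_emb, dim=-1)
    txt = F.normalize(txt_emb, dim=-1)
    logits = img @ txt.t() / temperature
    targets = torch.arange(img.size(0))
    loss_i2t = F.cross_entropy(logits, targets)      # retrieve caption given image
    loss_t2i = F.cross_entropy(logits.t(), targets)  # retrieve image given caption
    return 0.5 * (loss_i2t + loss_t2i)

# Toy usage: random features standing in for the visual (e.g. VGG) and
# language (e.g. BERT) stream outputs for a batch of 8 matched pairs.
image_features = torch.randn(8, 512, requires_grad=True)
text_features = torch.randn(8, 512, requires_grad=True)
loss = cross_modal_contrastive_loss(image_features, text_features)
loss.backward()
print(float(loss))
```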
- ---------------- - -### 05 Aug 2021 | [Hybrid Reasoning Network for Video-based Commonsense Captioning](https://arxiv.org/abs/2108.02365) | [⬇️](https://arxiv.org/pdf/2108.02365) -*Weijiang Yu, Jian Liang, Lei Ji, Lu Li, Yuejian Fang, Nong Xiao, Nan Duan* - - The task of video-based commonsense captioning aims to generate event-wise -captions and meanwhile provide multiple commonsense descriptions (e.g., -attribute, effect and intention) about the underlying event in the video. Prior -works explore the commonsense captions by using separate networks for different -commonsense types, which is time-consuming and lacks mining the interaction of -different commonsense. In this paper, we propose a Hybrid Reasoning Network -(HybridNet) to endow the neural networks with the capability of semantic-level -reasoning and word-level reasoning. Firstly, we develop multi-commonsense -learning for semantic-level reasoning by jointly training different commonsense -types in a unified network, which encourages the interaction between the clues -of multiple commonsense descriptions, event-wise captions and videos. Then, -there are two steps to achieve the word-level reasoning: (1) a memory module -records the history predicted sequence from the previous generation processes; -(2) a memory-routed multi-head attention (MMHA) module updates the word-level -attention maps by incorporating the history information from the memory module -into the transformer decoder for word-level reasoning. Moreover, the multimodal -features are used to make full use of diverse knowledge for commonsense -reasoning. Experiments and abundant analysis on the large-scale -Video-to-Commonsense benchmark show that our HybridNet achieves -state-of-the-art performance compared with other methods. - ---------------- - -### 13 Oct 2020 | [Beyond 512 Tokens: Siamese Multi-depth Transformer-based Hierarchical Encoder for Long-Form Document Matching](https://arxiv.org/abs/2004.12297) | [⬇️](https://arxiv.org/pdf/2004.12297) -*Liu Yang, Mingyang Zhang, Cheng Li, Michael Bendersky, Marc Najork* - - Many natural language processing and information retrieval problems can be -formalized as the task of semantic matching. Existing work in this area has -been largely focused on matching between short texts (e.g., question -answering), or between a short and a long text (e.g., ad-hoc retrieval). -Semantic matching between long-form documents, which has many important -applications like news recommendation, related article recommendation and -document clustering, is relatively less explored and needs more research -effort. In recent years, self-attention based models like Transformers and BERT -have achieved state-of-the-art performance in the task of text matching. These -models, however, are still limited to short text like a few sentences or one -paragraph due to the quadratic computational complexity of self-attention with -respect to input text length. In this paper, we address the issue by proposing -the Siamese Multi-depth Transformer-based Hierarchical (SMITH) Encoder for -long-form document matching. Our model contains several innovations to adapt -self-attention models for longer text input. In order to better capture -sentence level semantic relations within a document, we pre-train the model -with a novel masked sentence block language modeling task in addition to the -masked word language modeling task used by BERT. 
Our experimental results on -several benchmark datasets for long-form document matching show that our -proposed SMITH model outperforms the previous state-of-the-art models including -hierarchical attention, multi-depth attention-based hierarchical recurrent -neural network, and BERT. Comparing to BERT based baselines, our model is able -to increase maximum input text length from 512 to 2048. We will open source a -Wikipedia based benchmark dataset, code and a pre-trained checkpoint to -accelerate future research on long-form document matching. - ---------------- - -### 27 Mar 2019 | [Learning semantic sentence representations from visually grounded language without lexical knowledge](https://arxiv.org/abs/1903.11393) | [⬇️](https://arxiv.org/pdf/1903.11393) -*Danny Merkx and Stefan Frank* - - Current approaches to learning semantic representations of sentences often -use prior word-level knowledge. The current study aims to leverage visual -information in order to capture sentence level semantics without the need for -word embeddings. We use a multimodal sentence encoder trained on a corpus of -images with matching text captions to produce visually grounded sentence -embeddings. Deep Neural Networks are trained to map the two modalities to a -common embedding space such that for an image the corresponding caption can be -retrieved and vice versa. We show that our model achieves results comparable to -the current state-of-the-art on two popular image-caption retrieval benchmark -data sets: MSCOCO and Flickr8k. We evaluate the semantic content of the -resulting sentence embeddings using the data from the Semantic Textual -Similarity benchmark task and show that the multimodal embeddings correlate -well with human semantic similarity judgements. The system achieves -state-of-the-art results on several of these benchmarks, which shows that a -system trained solely on multimodal data, without assuming any word -representations, is able to capture sentence level semantics. Importantly, this -result shows that we do not need prior knowledge of lexical level semantics in -order to model sentence level semantics. These findings demonstrate the -importance of visual information in semantics. - ---------------- - -### 07 Jan 2019 | [Vector representations of text data in deep learning](https://arxiv.org/abs/1901.01695) | [⬇️](https://arxiv.org/pdf/1901.01695) -*Karol Grzegorczyk* - - In this dissertation we report results of our research on dense distributed -representations of text data. We propose two novel neural models for learning -such representations. The first model learns representations at the document -level, while the second model learns word-level representations. - For document-level representations we propose Binary Paragraph Vector: a -neural network models for learning binary representations of text documents, -which can be used for fast document retrieval. We provide a thorough evaluation -of these models and demonstrate that they outperform the seminal method in the -field in the information retrieval task. We also report strong results in -transfer learning settings, where our models are trained on a generic text -corpus and then used to infer codes for documents from a domain-specific -dataset. In contrast to previously proposed approaches, Binary Paragraph Vector -models learn embeddings directly from raw text data. - For word-level representations we propose Disambiguated Skip-gram: a neural -network model for learning multi-sense word embeddings. 
Representations learned -by this model can be used in downstream tasks, like part-of-speech tagging or -identification of semantic relations. In the word sense induction task -Disambiguated Skip-gram outperforms state-of-the-art models on three out of -four benchmarks datasets. Our model has an elegant probabilistic -interpretation. Furthermore, unlike previous models of this kind, it is -differentiable with respect to all its parameters and can be trained with -backpropagation. In addition to quantitative results, we present qualitative -evaluation of Disambiguated Skip-gram, including two-dimensional visualisations -of selected word-sense embeddings. - ---------------- - -### 21 Oct 2017 | [Superposed Episodic and Semantic Memory via Sparse Distributed Representation](https://arxiv.org/abs/1710.07829) | [⬇️](https://arxiv.org/pdf/1710.07829) -*Rod Rinkus, Jasmin Leveille* - - The abilities to perceive, learn, and use generalities, similarities, -classes, i.e., semantic memory (SM), is central to cognition. Machine learning -(ML), neural network, and AI research has been primarily driven by tasks -requiring such abilities. However, another central facet of cognition, -single-trial formation of permanent memories of experiences, i.e., episodic -memory (EM), has had relatively little focus. Only recently has EM-like -functionality been added to Deep Learning (DL) models, e.g., Neural Turing -Machine, Memory Networks. However, in these cases: a) EM is implemented as a -separate module, which entails substantial data movement (and so, time and -power) between the DL net itself and EM; and b) individual items are stored -localistically within the EM, precluding realizing the exponential -representational efficiency of distributed over localist coding. We describe -Sparsey, an unsupervised, hierarchical, spatial/spatiotemporal associative -memory model differing fundamentally from mainstream ML models, most crucially, -in its use of sparse distributed representations (SDRs), or, cell assemblies, -which admits an extremely efficient, single-trial learning algorithm that maps -input similarity into code space similarity (measured as intersection). SDRs of -individual inputs are stored in superposition and because similarity is -preserved, the patterns of intersections over the assigned codes reflect the -similarity, i.e., statistical, structure, of all orders, not simply pairwise, -over the inputs. Thus, SM, i.e., a generative model, is built as a -computationally free side effect of the act of storing episodic memory traces -of individual inputs, either spatial patterns or sequences. We report initial -results on MNIST and on the Weizmann video event recognition benchmarks. While -we have not yet attained SOTA class accuracy, learning takes only minutes on a -single CPU. - ---------------- - -### 18 May 2022 | [Graph Adaptive Semantic Transfer for Cross-domain Sentiment Classification](https://arxiv.org/abs/2205.08772) | [⬇️](https://arxiv.org/pdf/2205.08772) -*Kai Zhang, Qi Liu, Zhenya Huang, Mingyue Cheng, Kun Zhang, Mengdi Zhang, Wei Wu, Enhong Chen* - - Cross-domain sentiment classification (CDSC) aims to use the transferable -semantics learned from the source domain to predict the sentiment of reviews in -the unlabeled target domain. Existing studies in this task attach more -attention to the sequence modeling of sentences while largely ignoring the rich -domain-invariant semantics embedded in graph structures (i.e., the -part-of-speech tags and dependency relations). 
As an important aspect of -exploring characteristics of language comprehension, adaptive graph -representations have played an essential role in recent years. To this end, in -the paper, we aim to explore the possibility of learning invariant semantic -features from graph-like structures in CDSC. Specifically, we present Graph -Adaptive Semantic Transfer (GAST) model, an adaptive syntactic graph embedding -method that is able to learn domain-invariant semantics from both word -sequences and syntactic graphs. More specifically, we first raise a -POS-Transformer module to extract sequential semantic features from the word -sequences as well as the part-of-speech tags. Then, we design a Hybrid Graph -Attention (HGAT) module to generate syntax-based semantic features by -considering the transferable dependency relations. Finally, we devise an -Integrated aDaptive Strategy (IDS) to guide the joint learning process of both -modules. Extensive experiments on four public datasets indicate that GAST -achieves comparable effectiveness to a range of state-of-the-art models. - ---------------- - -### 01 Nov 2019 | [Read, Highlight and Summarize: A Hierarchical Neural Semantic Encoder-based Approach](https://arxiv.org/abs/1910.03177) | [⬇️](https://arxiv.org/pdf/1910.03177) -*Rajeev Bhatt Ambati, Saptarashmi Bandyopadhyay and Prasenjit Mitra* - - Traditional sequence-to-sequence (seq2seq) models and other variations of the -attention-mechanism such as hierarchical attention have been applied to the -text summarization problem. Though there is a hierarchy in the way humans use -language by forming paragraphs from sentences and sentences from words, -hierarchical models have usually not worked that much better than their -traditional seq2seq counterparts. This effect is mainly because either the -hierarchical attention mechanisms are too sparse using hard attention or noisy -using soft attention. In this paper, we propose a method based on extracting -the highlights of a document; a key concept that is conveyed in a few -sentences. In a typical text summarization dataset consisting of documents that -are 800 tokens in length (average), capturing long-term dependencies is very -important, e.g., the last sentence can be grouped with the first sentence of a -document to form a summary. LSTMs (Long Short-Term Memory) proved useful for -machine translation. However, they often fail to capture long-term dependencies -while modeling long sequences. To address these issues, we have adapted Neural -Semantic Encoders (NSE) to text summarization, a class of memory-augmented -neural networks by improving its functionalities and proposed a novel -hierarchical NSE that outperforms similar previous models significantly. The -quality of summarization was improved by augmenting linguistic factors, namely -lemma, and Part-of-Speech (PoS) tags, to each word in the dataset for improved -vocabulary coverage and generalization. The hierarchical NSE model on factored -dataset outperformed the state-of-the-art by nearly 4 ROUGE points. We further -designed and used the first GPU-based self-critical Reinforcement Learning -model. 
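
The factored-input idea above (attaching a lemma and PoS tag to every word for better vocabulary coverage) is easy to reproduce as a preprocessing step. A small sketch with spaCy follows; the `word|lemma|POS` format and the `en_core_web_sm` model are illustrative choices, not necessarily what the authors used.

```python
# Requires: pip install spacy && python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")

def factorize(text: str) -> str:
    """Attach lemma and PoS factors to each token, e.g. 'was|be|AUX'."""
    doc = nlp(text)
    return " ".join(f"{tok.text}|{tok.lemma_}|{tok.pos_}" for tok in doc)

print(factorize("The patient was discharged after the procedure."))
```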
- ---------------- - -### 07 Feb 2022 | [Classifying Textual Data with Pre-trained Vision Models through Transfer Learning and Data Transformations](https://arxiv.org/abs/2106.12479) | [⬇️](https://arxiv.org/pdf/2106.12479) -*Charaf Eddine Benarab* - - Knowledge is acquired by humans through experience, and no boundary is set -between the kinds of knowledge or skill levels we can achieve on different -tasks at the same time. When it comes to Neural Networks, that is not the case. -The breakthroughs in the field are extremely task and domain-specific. Vision -and language are dealt with in separate manners, using separate methods and -different datasets. Current text classification methods, mostly rely on -obtaining contextual embeddings for input text samples, then training a -classifier on the embedded dataset. Transfer learning in Language-related tasks -in general, is heavily used in obtaining the contextual text embeddings for the -input samples. In this work, we propose to use the knowledge acquired by -benchmark Vision Models which are trained on ImageNet to help a much smaller -architecture learn to classify text. A data transformation technique is used to -create a new image dataset, where each image represents a sentence embedding -from the last six layers of BERT, projected on a 2D plane using a t-SNE based -method. We trained five models containing early layers sliced from vision -models which are pretrained on ImageNet, on the created image dataset for the -IMDB dataset embedded with the last six layers of BERT. Despite the challenges -posed by the very different datasets, experimental results achieved by this -approach which links large pretrained models on both language and vision, are -very promising, without employing compute resources. Specifically, Sentiment -Analysis is achieved by five different models on the same image dataset -obtained after BERT embeddings are transformed into gray scale images. - Index Terms: BERT, Convolutional Neural Networks, Domain Adaptation, image -classification, Natural Language Processing, t-SNE, text classification, -Transfer Learning - ---------------- - -### 20 Jul 2016 | [Image Captioning with Deep Bidirectional LSTMs](https://arxiv.org/abs/1604.00790) | [⬇️](https://arxiv.org/pdf/1604.00790) -*Cheng Wang, Haojin Yang, Christian Bartz, Christoph Meinel* - - This work presents an end-to-end trainable deep bidirectional LSTM -(Long-Short Term Memory) model for image captioning. Our model builds on a deep -convolutional neural network (CNN) and two separate LSTM networks. It is -capable of learning long term visual-language interactions by making use of -history and future context information at high level semantic space. Two novel -deep bidirectional variant models, in which we increase the depth of -nonlinearity transition in different way, are proposed to learn hierarchical -visual-language embeddings. Data augmentation techniques such as multi-crop, -multi-scale and vertical mirror are proposed to prevent overfitting in training -deep models. We visualize the evolution of bidirectional LSTM internal states -over time and qualitatively analyze how our models "translate" image to -sentence. Our proposed models are evaluated on caption generation and -image-sentence retrieval tasks with three benchmark datasets: Flickr8K, -Flickr30K and MSCOCO datasets. 
We demonstrate that bidirectional LSTM models -achieve highly competitive performance to the state-of-the-art results on -caption generation even without integrating additional mechanism (e.g. object -detection, attention model etc.) and significantly outperform recent methods on -retrieval task. - ---------------- - -### 27 Apr 2022 | [SeqDialN: Sequential Visual Dialog Networks in Joint Visual-Linguistic Representation Space](https://arxiv.org/abs/2008.00397) | [⬇️](https://arxiv.org/pdf/2008.00397) -*Liu Yang* - - In this work, we formulate a visual dialog as an information flow in which -each piece of information is encoded with the joint visual-linguistic -representation of a single dialog round. Based on this formulation, we consider -the visual dialog task as a sequence problem consisting of ordered -visual-linguistic vectors. For featurization, we use a Dense Symmetric -Co-Attention network as a lightweight vison-language joint representation -generator to fuse multimodal features (i.e., image and text), yielding better -computation and data efficiencies. For inference, we propose two Sequential -Dialog Networks (SeqDialN): the first uses LSTM for information propagation -(IP) and the second uses a modified Transformer for multi-step reasoning (MR). -Our architecture separates the complexity of multimodal feature fusion from -that of inference, which allows simpler design of the inference engine. IP -based SeqDialN is our baseline with a simple 2-layer LSTM design that achieves -decent performance. MR based SeqDialN, on the other hand, recurrently refines -the semantic question/history representations through the self-attention stack -of Transformer and produces promising results on the visual dialog task. On -VisDial v1.0 test-std dataset, our best single generative SeqDialN achieves -62.54% NDCG and 48.63% MRR; our ensemble generative SeqDialN achieves 63.78% -NDCG and 49.98% MRR, which set a new state-of-the-art generative visual dialog -model. We fine-tune discriminative SeqDialN with dense annotations and boost -the performance up to 72.41% NDCG and 55.11% MRR. In this work, we discuss the -extensive experiments we have conducted to demonstrate the effectiveness of our -model components. We also provide visualization for the reasoning process from -the relevant conversation rounds and discuss our fine-tuning methods. Our code -is available at https://github.com/xiaoxiaoheimei/SeqDialN - ---------------- - -### 26 Jul 2022 | [Matching Visual Features to Hierarchical Semantic Topics for Image Paragraph Captioning](https://arxiv.org/abs/2105.04143) | [⬇️](https://arxiv.org/pdf/2105.04143) -*Dandan Guo, Ruiying Lu, Bo Chen, Zequn Zeng, Mingyuan Zhou* - - Observing a set of images and their corresponding paragraph-captions, a -challenging task is to learn how to produce a semantically coherent paragraph -to describe the visual content of an image. Inspired by recent successes in -integrating semantic topics into this task, this paper develops a plug-and-play -hierarchical-topic-guided image paragraph generation framework, which couples a -visual extractor with a deep topic model to guide the learning of a language -model. To capture the correlations between the image and text at multiple -levels of abstraction and learn the semantic topics from images, we design a -variational inference network to build the mapping from image features to -textual captions. 
To guide the paragraph generation, the learned hierarchical -topics and visual features are integrated into the language model, including -Long Short-Term Memory (LSTM) and Transformer, and jointly optimized. -Experiments on public datasets demonstrate that the proposed models, which are -competitive with many state-of-the-art approaches in terms of standard -evaluation metrics, can be used to both distill interpretable multi-layer -semantic topics and generate diverse and coherent captions. We release our code -at https://github.com/DandanGuo1993/VTCM-based-image-paragraph-caption.git - ---------------- - -### 12 Sep 2023 | [Grounded Language Acquisition From Object and Action Imagery](https://arxiv.org/abs/2309.06335) | [⬇️](https://arxiv.org/pdf/2309.06335) -*James Robert Kubricht and Zhaoyuan Yang and Jianwei Qiu and Peter Henry Tu* - - Deep learning approaches to natural language processing have made great -strides in recent years. While these models produce symbols that convey vast -amounts of diverse knowledge, it is unclear how such symbols are grounded in -data from the world. In this paper, we explore the development of a private -language for visual data representation by training emergent language (EL) -encoders/decoders in both i) a traditional referential game environment and ii) -a contrastive learning environment utilizing a within-class matching training -paradigm. An additional classification layer utilizing neural machine -translation and random forest classification was used to transform symbolic -representations (sequences of integer symbols) to class labels. These methods -were applied in two experiments focusing on object recognition and action -recognition. For object recognition, a set of sketches produced by human -participants from real imagery was used (Sketchy dataset) and for action -recognition, 2D trajectories were generated from 3D motion capture systems -(MOVI dataset). In order to interpret the symbols produced for data in each -experiment, gradient-weighted class activation mapping (Grad-CAM) methods were -used to identify pixel regions indicating semantic features which contribute -evidence towards symbols in learned languages. Additionally, a t-distributed -stochastic neighbor embedding (t-SNE) method was used to investigate embeddings -learned by CNN feature extractors. - ---------------- - -### 28 Jun 2023 | [AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn](https://arxiv.org/abs/2306.08640) | [⬇️](https://arxiv.org/pdf/2306.08640) -*Difei Gao, Lei Ji, Luowei Zhou, Kevin Qinghong Lin, Joya Chen, Zihan Fan, Mike Zheng Shou* - - Recent research on Large Language Models (LLMs) has led to remarkable -advancements in general NLP AI assistants. Some studies have further explored -the use of LLMs for planning and invoking models or APIs to address more -general multi-modal user queries. Despite this progress, complex visual-based -tasks still remain challenging due to the diverse nature of visual tasks. This -diversity is reflected in two aspects: 1) Reasoning paths. For many real-life -applications, it is hard to accurately decompose a query simply by examining -the query itself. Planning based on the specific visual content and the results -of each step is usually required. 2) Flexible inputs and intermediate results. -Input forms could be flexible for in-the-wild cases, and involves not only a -single image or video but a mixture of videos and images, e.g., a user-view -image with some reference videos. 
Besides, a complex reasoning process will also generate diverse multimodal intermediate results, e.g., video narrations, segmented video clips, etc. To address such general cases, we propose a multi-modal AI assistant, AssistGPT, with an interleaved code and language reasoning approach called Plan, Execute, Inspect, and Learn (PEIL) to integrate LLMs with various tools. Specifically, the Planner is capable of using natural language to plan which tool in the Executor should act next based on the current reasoning progress. The Inspector is an efficient memory manager that assists the Planner in feeding proper visual information into a specific tool. Finally, since the entire reasoning process is complex and flexible, a Learner is designed to enable the model to autonomously explore and discover the optimal solution. We conducted experiments on the A-OKVQA and NExT-QA benchmarks, achieving state-of-the-art results. Moreover, showcases demonstrate the ability of our system to handle questions far more complex than those found in the benchmarks.

---------------

### 09 Oct 2021 | [Leveraging recent advances in Pre-Trained Language Models for Eye-Tracking Prediction](https://arxiv.org/abs/2110.04475) | [⬇️](https://arxiv.org/pdf/2110.04475)
*Varun Madhavan, Aditya Girish Pawate, Shraman Pal, Abhranil Chandra*

 Cognitively inspired Natural Language Processing uses human-derived behavioral data, such as eye-tracking data, which reflect the semantic representations of language in the human brain, to augment neural nets to solve a range of tasks spanning syntax and semantics, with the aim of teaching machines about language processing mechanisms. In this paper, we use the ZuCo 1.0 and ZuCo 2.0 datasets containing eye-gaze features to explore different linguistic models that directly predict these gaze features for each word with respect to its sentence. We tried different neural network models with the words as inputs to predict the targets. After extensive experimentation and feature engineering, we devised a novel architecture consisting of a RoBERTa token classifier with a dense layer on top for language modeling, and a stand-alone model consisting of dense layers followed by a transformer layer for the extra features we engineered. Finally, we took the mean of the outputs of both models to make the final predictions. We evaluated the models using mean absolute error (MAE) and the R2 score for each target.

---------------




# 4


1. 🔑 Natural Language Processing for Clinical Documentation
   - 🌟 Automated extraction of relevant information from medical reports
   - 🌟 Real-time transcription and analysis of doctor-patient conversations
   - 🌟 Conversational AI assistants for patient triage and diagnosis support

2. 🔑 Computer Vision for Medical Imaging Analysis
   - 🌟 Deep learning models for image segmentation and anomaly detection
   - 🌟 Predictive modeling for disease diagnosis and prognosis
   - 🌟 Augmented reality guidance for surgical procedures

3. 🔑 Personalized Treatment Recommendation Systems
   - 🌟 Patient-specific risk stratification and treatment planning
   - 🌟 Integration of genomic data and electronic health records
   - 🌟 AI-driven clinical decision support systems

**Keywords and Glossary**

1. 🔑 Natural Language Processing (NLP)
2. 🔑 Computer Vision (CV)
3. 🔑 Deep Learning (DL)
4. 🔑 Predictive Modeling
5. 🔑 Electronic Health Records (EHR)
6. 🔑 Genomics
7. 🔑 Clinical Decision Support Systems (CDSS)

```mermaid
graph LR
NLP --> EHR
NLP --> CDSS
CV --> Imaging
CV --> Surgery
DL --> NLP
DL --> CV
DL --> Predictive
Predictive --> Treatment
Predictive --> Prognosis
EHR --> Treatment
Genomics --> Treatment
CDSS --> Treatment
```

```python
# app.py
import streamlit as st

st.title("AI and Telemedical Applications")

st.header("State of the Art Techniques")
st.write("1. 🔑 Natural Language Processing for Clinical Documentation")
st.write("   - 🌟 Automated extraction of relevant information from medical reports")
st.write("   - 🌟 Real-time transcription and analysis of doctor-patient conversations")
st.write("   - 🌟 Conversational AI assistants for patient triage and diagnosis support")

st.write("2. 🔑 Computer Vision for Medical Imaging Analysis")
st.write("   - 🌟 Deep learning models for image segmentation and anomaly detection")
st.write("   - 🌟 Predictive modeling for disease diagnosis and prognosis")
st.write("   - 🌟 Augmented reality guidance for surgical procedures")

st.write("3. 🔑 Personalized Treatment Recommendation Systems")
st.write("   - 🌟 Patient-specific risk stratification and treatment planning")
st.write("   - 🌟 Integration of genomic data and electronic health records")
st.write("   - 🌟 AI-driven clinical decision support systems")

st.header("Keywords and Glossary")
st.write("1. 🔑 Natural Language Processing (NLP)")
st.write("2. 🔑 Computer Vision (CV)")
st.write("3. 🔑 Deep Learning (DL)")
st.write("4. 🔑 Predictive Modeling")
st.write("5. 🔑 Electronic Health Records (EHR)")
st.write("6. 🔑 Genomics")
st.write("7. 🔑 Clinical Decision Support Systems (CDSS)")

st.header("Glossary Model")
# Streamlit has no built-in Mermaid renderer (there is no st.mermaid), so the
# diagram source is shown as a code block; a Mermaid component could be
# swapped in here if one is installed.
st.code(
    """
graph LR
NLP --> EHR
NLP --> CDSS
CV --> Imaging
CV --> Surgery
DL --> NLP
DL --> CV
DL --> Predictive
Predictive --> Treatment
Predictive --> Prognosis
EHR --> Treatment
Genomics --> Treatment
CDSS --> Treatment
""",
    language="mermaid",
)
```


# 🩺🔍 Search Results
### 14 Sep 2022 | [Summarizing Patients Problems from Hospital Progress Notes Using Pre-trained Sequence-to-Sequence Models](https://arxiv.org/abs/2208.08408) | [⬇️](https://arxiv.org/pdf/2208.08408)
*Yanjun Gao, Dmitriy Dligach, Timothy Miller, Dongfang Xu, Matthew M. Churpek, Majid Afshar*

 Automatically summarizing patients' main problems from daily progress notes using natural language processing methods helps to battle against information and cognitive overload in hospital settings and potentially assists providers with computerized diagnostic decision support. Problem list summarization requires a model to understand, abstract, and generate clinical documentation. In this work, we propose a new NLP task that aims to generate a list of problems in a patient's daily care plan using input from the provider's progress notes during hospitalization. We investigate the performance of T5 and BART, two state-of-the-art seq2seq transformer architectures, in solving this problem. We provide a corpus built on top of the publicly available electronic health record progress notes in the Medical Information Mart for Intensive Care (MIMIC)-III. T5 and BART are trained on general domain text, and we experiment with a data augmentation method and a domain adaptation pre-training method to increase exposure to medical vocabulary and knowledge. Evaluation methods include ROUGE, BERTScore, cosine similarity on sentence embedding, and F-score on medical concepts.
Results show that T5 with domain -adaptive pre-training achieves significant performance gains compared to a -rule-based system and general domain pre-trained language models, indicating a -promising direction for tackling the problem summarization task. - ---------------- - -### 22 Jun 2023 | [Natural Language Processing in Electronic Health Records in Relation to Healthcare Decision-making: A Systematic Review](https://arxiv.org/abs/2306.12834) | [⬇️](https://arxiv.org/pdf/2306.12834) -*Elias Hossain, Rajib Rana, Niall Higgins, Jeffrey Soar, Prabal Datta Barua, Anthony R. Pisani, Ph.D, Kathryn Turner}* - - Background: Natural Language Processing (NLP) is widely used to extract -clinical insights from Electronic Health Records (EHRs). However, the lack of -annotated data, automated tools, and other challenges hinder the full -utilisation of NLP for EHRs. Various Machine Learning (ML), Deep Learning (DL) -and NLP techniques are studied and compared to understand the limitations and -opportunities in this space comprehensively. - Methodology: After screening 261 articles from 11 databases, we included 127 -papers for full-text review covering seven categories of articles: 1) medical -note classification, 2) clinical entity recognition, 3) text summarisation, 4) -deep learning (DL) and transfer learning architecture, 5) information -extraction, 6) Medical language translation and 7) other NLP applications. This -study follows the Preferred Reporting Items for Systematic Reviews and -Meta-Analyses (PRISMA) guidelines. - Result and Discussion: EHR was the most commonly used data type among the -selected articles, and the datasets were primarily unstructured. Various ML and -DL methods were used, with prediction or classification being the most common -application of ML or DL. The most common use cases were: the International -Classification of Diseases, Ninth Revision (ICD-9) classification, clinical -note analysis, and named entity recognition (NER) for clinical descriptions and -research on psychiatric disorders. - Conclusion: We find that the adopted ML models were not adequately assessed. -In addition, the data imbalance problem is quite important, yet we must find -techniques to address this underlining problem. Future studies should address -key limitations in studies, primarily identifying Lupus Nephritis, Suicide -Attempts, perinatal self-harmed and ICD-9 classification. - ---------------- - -### 16 Dec 2022 | [GatorTron: A Large Clinical Language Model to Unlock Patient Information from Unstructured Electronic Health Records](https://arxiv.org/abs/2203.03540) | [⬇️](https://arxiv.org/pdf/2203.03540) -*Xi Yang, Aokun Chen, Nima PourNejatian, Hoo Chang Shin, Kaleb E Smith, Christopher Parisien, Colin Compas, Cheryl Martin, Mona G Flores, Ying Zhang, Tanja Magoc, Christopher A Harle, Gloria Lipori, Duane A Mitchell, William R Hogan, Elizabeth A Shenkman, Jiang Bian, Yonghui Wu* - - There is an increasing interest in developing artificial intelligence (AI) -systems to process and interpret electronic health records (EHRs). Natural -language processing (NLP) powered by pretrained language models is the key -technology for medical AI systems utilizing clinical narratives. However, there -are few clinical language models, the largest of which trained in the clinical -domain is comparatively small at 110 million parameters (compared with billions -of parameters in the general domain). 
It is not clear how large clinical -language models with billions of parameters can help medical AI systems utilize -unstructured EHRs. In this study, we develop from scratch a large clinical -language model - GatorTron - using >90 billion words of text (including >82 -billion words of de-identified clinical text) and systematically evaluate it on -5 clinical NLP tasks including clinical concept extraction, medical relation -extraction, semantic textual similarity, natural language inference (NLI), and -medical question answering (MQA). We examine how (1) scaling up the number of -parameters and (2) scaling up the size of the training data could benefit these -NLP tasks. GatorTron models scale up the clinical language model from 110 -million to 8.9 billion parameters and improve 5 clinical NLP tasks (e.g., 9.6% -and 9.5% improvement in accuracy for NLI and MQA), which can be applied to -medical AI systems to improve healthcare delivery. The GatorTron models are -publicly available at: -https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara/models/gatortron_og. - ---------------- - -### 14 Mar 2023 | [Progress Note Understanding -- Assessment and Plan Reasoning: Overview of the 2022 N2C2 Track 3 Shared Task](https://arxiv.org/abs/2303.08038) | [⬇️](https://arxiv.org/pdf/2303.08038) -*Yanjun Gao, Dmitriy Dligach, Timothy Miller, Matthew M Churpek, Ozlem Uzuner, Majid Afshar* - - Daily progress notes are common types in the electronic health record (EHR) -where healthcare providers document the patient's daily progress and treatment -plans. The EHR is designed to document all the care provided to patients, but -it also enables note bloat with extraneous information that distracts from the -diagnoses and treatment plans. Applications of natural language processing -(NLP) in the EHR is a growing field with the majority of methods in information -extraction. Few tasks use NLP methods for downstream diagnostic decision -support. We introduced the 2022 National NLP Clinical Challenge (N2C2) Track 3: -Progress Note Understanding - Assessment and Plan Reasoning as one step towards -a new suite of tasks. The Assessment and Plan Reasoning task focuses on the -most critical components of progress notes, Assessment and Plan subsections -where health problems and diagnoses are contained. The goal of the task was to -develop and evaluate NLP systems that automatically predict causal relations -between the overall status of the patient contained in the Assessment section -and its relation to each component of the Plan section which contains the -diagnoses and treatment plans. The goal of the task was to identify and -prioritize diagnoses as the first steps in diagnostic decision support to find -the most relevant information in long documents like daily progress notes. We -present the results of 2022 n2c2 Track 3 and provide a description of the data, -evaluation, participation and system performance. - ---------------- - -### 07 Apr 2021 | [COVID-19 SignSym: a fast adaptation of a general clinical NLP tool to identify and normalize COVID-19 signs and symptoms to OMOP common data model](https://arxiv.org/abs/2007.10286) | [⬇️](https://arxiv.org/pdf/2007.10286) -*Jingqi Wang, Noor Abu-el-rub, Josh Gray, Huy Anh Pham, Yujia Zhou, Frank Manion, Mei Liu, Xing Song, Hua Xu, Masoud Rouhizadeh, Yaoyun Zhang* - - The COVID-19 pandemic swept across the world rapidly, infecting millions of -people. 
An efficient tool that can accurately recognize important clinical -concepts of COVID-19 from free text in electronic health records (EHRs) will be -valuable to accelerate COVID-19 clinical research. To this end, this study aims -at adapting the existing CLAMP natural language processing tool to quickly -build COVID-19 SignSym, which can extract COVID-19 signs/symptoms and their 8 -attributes (body location, severity, temporal expression, subject, condition, -uncertainty, negation, and course) from clinical text. The extracted -information is also mapped to standard concepts in the Observational Medical -Outcomes Partnership common data model. A hybrid approach of combining deep -learning-based models, curated lexicons, and pattern-based rules was applied to -quickly build the COVID-19 SignSym from CLAMP, with optimized performance. Our -extensive evaluation using 3 external sites with clinical notes of COVID-19 -patients, as well as the online medical dialogues of COVID-19, shows COVID-19 -Sign-Sym can achieve high performance across data sources. The workflow used -for this study can be generalized to other use cases, where existing clinical -natural language processing tools need to be customized for specific -information needs within a short time. COVID-19 SignSym is freely accessible to -the research community as a downloadable package -(https://clamp.uth.edu/covid/nlp.php) and has been used by 16 healthcare -organizations to support clinical research of COVID-19. - ---------------- - -### 11 Apr 2019 | [Text2Node: a Cross-Domain System for Mapping Arbitrary Phrases to a Taxonomy](https://arxiv.org/abs/1905.01958) | [⬇️](https://arxiv.org/pdf/1905.01958) -*Rohollah Soltani and Alexandre Tomberg* - - Electronic health record (EHR) systems are used extensively throughout the -healthcare domain. However, data interchangeability between EHR systems is -limited due to the use of different coding standards across systems. Existing -methods of mapping coding standards based on manual human experts mapping, -dictionary mapping, symbolic NLP and classification are unscalable and cannot -accommodate large scale EHR datasets. - In this work, we present Text2Node, a cross-domain mapping system capable of -mapping medical phrases to concepts in a large taxonomy (such as SNOMED CT). -The system is designed to generalize from a limited set of training samples and -map phrases to elements of the taxonomy that are not covered by training data. -As a result, our system is scalable, robust to wording variants between coding -systems and can output highly relevant concepts when no exact concept exists in -the target taxonomy. Text2Node operates in three main stages: first, the -lexicon is mapped to word embeddings; second, the taxonomy is vectorized using -node embeddings; and finally, the mapping function is trained to connect the -two embedding spaces. We compared multiple algorithms and architectures for -each stage of the training, including GloVe and FastText word embeddings, CNN -and Bi-LSTM mapping functions, and node2vec for node embeddings. We confirmed -the robustness and generalisation properties of Text2Node by mapping ICD-9-CM -Diagnosis phrases to SNOMED CT and by zero-shot training at comparable -accuracy. - This system is a novel methodological contribution to the task of normalizing -and linking phrases to a taxonomy, advancing data interchangeability in -healthcare. 
When applied, the system can use electronic health records to -generate an embedding that incorporates taxonomical medical knowledge to -improve clinical predictive models. - ---------------- - -### 08 Oct 2023 | [Extraction of Medication and Temporal Relation from Clinical Text using Neural Language Models](https://arxiv.org/abs/2310.02229) | [⬇️](https://arxiv.org/pdf/2310.02229) -*Hangyu Tu and Lifeng Han and Goran Nenadic* - - Clinical texts, represented in electronic medical records (EMRs), contain -rich medical information and are essential for disease prediction, personalised -information recommendation, clinical decision support, and medication pattern -mining and measurement. Relation extractions between medication mentions and -temporal information can further help clinicians better understand the -patients' treatment history. To evaluate the performances of deep learning (DL) -and large language models (LLMs) in medication extraction and temporal -relations classification, we carry out an empirical investigation of -\textbf{MedTem} project using several advanced learning structures including -BiLSTM-CRF and CNN-BiLSTM for a clinical domain named entity recognition (NER), -and BERT-CNN for temporal relation extraction (RE), in addition to the -exploration of different word embedding techniques. Furthermore, we also -designed a set of post-processing roles to generate structured output on -medications and the temporal relation. Our experiments show that CNN-BiLSTM -slightly wins the BiLSTM-CRF model on the i2b2-2009 clinical NER task yielding -75.67, 77.83, and 78.17 for precision, recall, and F1 scores using Macro -Average. BERT-CNN model also produced reasonable evaluation scores 64.48, -67.17, and 65.03 for P/R/F1 using Macro Avg on the temporal relation extraction -test set from i2b2-2012 challenges. Code and Tools from MedTem will be hosted -at \url{https://github.com/HECTA-UoM/MedTem} - ---------------- - -### 13 Oct 2020 | [A Natural Language Processing Pipeline of Chinese Free-text Radiology Reports for Liver Cancer Diagnosis](https://arxiv.org/abs/2004.13848) | [⬇️](https://arxiv.org/pdf/2004.13848) -*Honglei Liu, Yan Xu, Zhiqiang Zhang, Ni Wang, Yanqun Huang, Yanjun Hu, Zhenghan Yang, Rui Jiang, Hui Chen* - - Despite the rapid development of natural language processing (NLP) -implementation in electronic medical records (EMRs), Chinese EMRs processing -remains challenging due to the limited corpus and specific grammatical -characteristics, especially for radiology reports. In this study, we designed -an NLP pipeline for the direct extraction of clinically relevant features from -Chinese radiology reports, which is the first key step in computer-aided -radiologic diagnosis. The pipeline was comprised of named entity recognition, -synonyms normalization, and relationship extraction to finally derive the -radiological features composed of one or more terms. In named entity -recognition, we incorporated lexicon into deep learning model bidirectional -long short-term memory-conditional random field (BiLSTM-CRF), and the model -finally achieved an F1 score of 93.00%. With the extracted radiological -features, least absolute shrinkage and selection operator and machine learning -methods (support vector machine, random forest, decision tree, and logistic -regression) were used to build the classifiers for liver cancer prediction. 
For -liver cancer diagnosis, random forest had the highest predictive performance in -liver cancer diagnosis (F1 score 86.97%, precision 87.71%, and recall 86.25%). -This work was a comprehensive NLP study focusing on Chinese radiology reports -and the application of NLP in cancer risk prediction. The proposed NLP pipeline -for the radiological feature extraction could be easily implemented in other -kinds of Chinese clinical texts and other disease predictive tasks. - ---------------- - -### 14 Dec 2022 | [DR.BENCH: Diagnostic Reasoning Benchmark for Clinical Natural Language Processing](https://arxiv.org/abs/2209.14901) | [⬇️](https://arxiv.org/pdf/2209.14901) -*Yanjun Gao, Dmitriy Dligach, Timothy Miller, John Caskey, Brihat Sharma, Matthew M Churpek, Majid Afshar* - - The meaningful use of electronic health records (EHR) continues to progress -in the digital era with clinical decision support systems augmented by -artificial intelligence. A priority in improving provider experience is to -overcome information overload and reduce the cognitive burden so fewer medical -errors and cognitive biases are introduced during patient care. One major type -of medical error is diagnostic error due to systematic or predictable errors in -judgment that rely on heuristics. The potential for clinical natural language -processing (cNLP) to model diagnostic reasoning in humans with forward -reasoning from data to diagnosis and potentially reduce the cognitive burden -and medical error has not been investigated. Existing tasks to advance the -science in cNLP have largely focused on information extraction and named entity -recognition through classification tasks. We introduce a novel suite of tasks -coined as Diagnostic Reasoning Benchmarks, DR.BENCH, as a new benchmark for -developing and evaluating cNLP models with clinical diagnostic reasoning -ability. The suite includes six tasks from ten publicly available datasets -addressing clinical text understanding, medical knowledge reasoning, and -diagnosis generation. DR.BENCH is the first clinical suite of tasks designed to -be a natural language generation framework to evaluate pre-trained language -models. Experiments with state-of-the-art pre-trained generative language -models using large general domain models and models that were continually -trained on a medical corpus demonstrate opportunities for improvement when -evaluated in DR. BENCH. We share DR. BENCH as a publicly available GitLab -repository with a systematic approach to load and evaluate models for the cNLP -community. - ---------------- - -### 20 Jun 2017 | [Medical Concept Representation Learning from Electronic Health Records and its Application on Heart Failure Prediction](https://arxiv.org/abs/1602.03686) | [⬇️](https://arxiv.org/pdf/1602.03686) -*Edward Choi, Andy Schuetz, Walter F. Stewart, Jimeng Sun* - - Objective: To transform heterogeneous clinical data from electronic health -records into clinically meaningful constructed features using data driven -method that rely, in part, on temporal relations among data. Materials and -Methods: The clinically meaningful representations of medical concepts and -patients are the key for health analytic applications. Most of existing -approaches directly construct features mapped to raw data (e.g., ICD or CPT -codes), or utilize some ontology mapping such as SNOMED codes. However, none of -the existing approaches leverage EHR data directly for learning such concept -representation. 
We propose a new way to represent heterogeneous medical -concepts (e.g., diagnoses, medications and procedures) based on co-occurrence -patterns in longitudinal electronic health records. The intuition behind the -method is to map medical concepts that are co-occuring closely in time to -similar concept vectors so that their distance will be small. We also derive a -simple method to construct patient vectors from the related medical concept -vectors. Results: For qualitative evaluation, we study similar medical concepts -across diagnosis, medication and procedure. In quantitative evaluation, our -proposed representation significantly improves the predictive modeling -performance for onset of heart failure (HF), where classification methods (e.g. -logistic regression, neural network, support vector machine and K-nearest -neighbors) achieve up to 23% improvement in area under the ROC curve (AUC) -using this proposed representation. Conclusion: We proposed an effective method -for patient and medical concept representation learning. The resulting -representation can map relevant concepts together and also improves predictive -modeling performance. - ---------------- - -### 10 Mar 2021 | [Automated Coding of Under-Studied Medical Concept Domains: Linking Physical Activity Reports to the International Classification of Functioning, Disability, and Health](https://arxiv.org/abs/2011.13978) | [⬇️](https://arxiv.org/pdf/2011.13978) -*Denis Newman-Griffis and Eric Fosler-Lussier* - - Linking clinical narratives to standardized vocabularies and coding systems -is a key component of unlocking the information in medical text for analysis. -However, many domains of medical concepts lack well-developed terminologies -that can support effective coding of medical text. We present a framework for -developing natural language processing (NLP) technologies for automated coding -of under-studied types of medical information, and demonstrate its -applicability via a case study on physical mobility function. Mobility is a -component of many health measures, from post-acute care and surgical outcomes -to chronic frailty and disability, and is coded in the International -Classification of Functioning, Disability, and Health (ICF). However, mobility -and other types of functional activity remain under-studied in medical -informatics, and neither the ICF nor commonly-used medical terminologies -capture functional status terminology in practice. We investigated two -data-driven paradigms, classification and candidate selection, to link -narrative observations of mobility to standardized ICF codes, using a dataset -of clinical narratives from physical therapy encounters. Recent advances in -language modeling and word embedding were used as features for established -machine learning models and a novel deep learning approach, achieving a macro -F-1 score of 84% on linking mobility activity reports to ICF codes. Both -classification and candidate selection approaches present distinct strengths -for automated coding in under-studied domains, and we highlight that the -combination of (i) a small annotated data set; (ii) expert definitions of codes -of interest; and (iii) a representative text corpus is sufficient to produce -high-performing automated coding systems. This study has implications for the -ongoing growth of NLP tools for a variety of specialized applications in -clinical care and research. 
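
As a toy illustration of the candidate-selection paradigm described above, the sketch below ranks a free-text mobility observation against a handful of ICF-style code descriptions with TF-IDF cosine similarity. The real study used learned embeddings, expert code definitions, and a much larger code set, so the vectorizer and the shortened descriptions here are only stand-ins.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative ICF mobility codes with shortened, paraphrased descriptions.
candidates = {
    "d450": "Walking: moving along a surface on foot",
    "d455": "Moving around: climbing, running, jumping",
    "d465": "Moving around using equipment such as a wheelchair or walker",
}

observation = "Patient ambulates 50 feet with a rolling walker and contact guard assist."

descriptions = list(candidates.values())
vectorizer = TfidfVectorizer().fit(descriptions + [observation])
scores = cosine_similarity(
    vectorizer.transform([observation]),
    vectorizer.transform(descriptions),
)[0]

# Rank candidate codes by similarity to the narrative observation.
for (code, desc), score in sorted(zip(candidates.items(), scores), key=lambda x: -x[1]):
    print(f"{code}  {score:.3f}  {desc}")
```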
- ---------------- - -### 26 Oct 2022 | [Few-Shot Learning for Clinical Natural Language Processing Using Siamese Neural Networks](https://arxiv.org/abs/2208.14923) | [⬇️](https://arxiv.org/pdf/2208.14923) -*David Oniani, Sonish Sivarajkumar, Yanshan Wang* - - Clinical Natural Language Processing (NLP) has become an emerging technology -in healthcare that leverages a large amount of free-text data in electronic -health records (EHRs) to improve patient care, support clinical decisions, and -facilitate clinical and translational science research. Recently, deep learning -has achieved state-of-the-art performance in many clinical NLP tasks. However, -training deep learning models usually requires large annotated datasets, which -are normally not publicly available and can be time-consuming to build in -clinical domains. Working with smaller annotated datasets is typical in -clinical NLP and therefore, ensuring that deep learning models perform well is -crucial for the models to be used in real-world applications. A widely adopted -approach is fine-tuning existing Pre-trained Language Models (PLMs), but these -attempts fall short when the training dataset contains only a few annotated -samples. Few-Shot Learning (FSL) has recently been investigated to tackle this -problem. Siamese Neural Network (SNN) has been widely utilized as an FSL -approach in computer vision, but has not been studied well in NLP. Furthermore, -the literature on its applications in clinical domains is scarce. In this -paper, we propose two SNN-based FSL approaches for clinical NLP, including -Pre-Trained SNN (PT-SNN) and SNN with Second-Order Embeddings (SOE-SNN). We -evaluated the proposed approaches on two clinical tasks, namely clinical text -classification and clinical named entity recognition. We tested three few-shot -settings including 4-shot, 8-shot, and 16-shot learning. Both clinical NLP -tasks were benchmarked using three PLMs, including BERT,BioBERT, and -BioClinicalBERT. The experimental results verified the effectiveness of the -proposed SNN-based FSL approaches in both NLP tasks. - ---------------- - -### 06 Feb 2024 | [Intensive Vision-guided Network for Radiology Report Generation](https://arxiv.org/abs/2402.03754) | [⬇️](https://arxiv.org/pdf/2402.03754) -*Fudan Zheng, Mengfei Li, Ying Wang, Weijiang Yu, Ruixuan Wang, Zhiguang Chen, Nong Xiao, and Yutong Lu* - - Automatic radiology report generation is booming due to its huge application -potential for the healthcare industry. However, existing computer vision and -natural language processing approaches to tackle this problem are limited in -two aspects. First, when extracting image features, most of them neglect -multi-view reasoning in vision and model single-view structure of medical -images, such as space-view or channel-view. However, clinicians rely on -multi-view imaging information for comprehensive judgment in daily clinical -diagnosis. Second, when generating reports, they overlook context reasoning -with multi-modal information and focus on pure textual optimization utilizing -retrieval-based methods. We aim to address these two issues by proposing a -model that better simulates clinicians' perspectives and generates more -accurate reports. Given the above limitation in feature extraction, we propose -a Globally-intensive Attention (GIA) module in the medical image encoder to -simulate and integrate multi-view vision perception. GIA aims to learn three -types of vision perception: depth view, space view, and pixel view. 
On the -other hand, to address the above problem in report generation, we explore how -to involve multi-modal signals to generate precisely matched reports, i.e., how -to integrate previously predicted words with region-aware visual content in -next word prediction. Specifically, we design a Visual Knowledge-guided Decoder -(VKGD), which can adaptively consider how much the model needs to rely on -visual information and previously predicted text to assist next word -prediction. Hence, our final Intensive Vision-guided Network (IVGN) framework -includes a GIA-guided Visual Encoder and the VKGD. Experiments on two -commonly-used datasets IU X-Ray and MIMIC-CXR demonstrate the superior ability -of our method compared with other state-of-the-art approaches. - ---------------- - -### 01 Jul 2023 | [Hierarchical Pretraining for Biomedical Term Embeddings](https://arxiv.org/abs/2307.00266) | [⬇️](https://arxiv.org/pdf/2307.00266) -*Bryan Cai, Sihang Zeng, Yucong Lin, Zheng Yuan, Doudou Zhou, and Lu Tian* - - Electronic health records (EHR) contain narrative notes that provide -extensive details on the medical condition and management of patients. Natural -language processing (NLP) of clinical notes can use observed frequencies of -clinical terms as predictive features for downstream applications such as -clinical decision making and patient trajectory prediction. However, due to the -vast number of highly similar and related clinical concepts, a more effective -modeling strategy is to represent clinical terms as semantic embeddings via -representation learning and use the low dimensional embeddings as feature -vectors for predictive modeling. To achieve efficient representation, -fine-tuning pretrained language models with biomedical knowledge graphs may -generate better embeddings for biomedical terms than those from standard -language models alone. These embeddings can effectively discriminate synonymous -pairs of from those that are unrelated. However, they often fail to capture -different degrees of similarity or relatedness for concepts that are -hierarchical in nature. To overcome this limitation, we propose HiPrBERT, a -novel biomedical term representation model trained on additionally complied -data that contains hierarchical structures for various biomedical terms. We -modify an existing contrastive loss function to extract information from these -hierarchies. Our numerical experiments demonstrate that HiPrBERT effectively -learns the pair-wise distance from hierarchical information, resulting in a -substantially more informative embeddings for further biomedical applications - ---------------- - -### 27 Jun 2019 | [Training Models to Extract Treatment Plans from Clinical Notes Using Contents of Sections with Headings](https://arxiv.org/abs/1906.11930) | [⬇️](https://arxiv.org/pdf/1906.11930) -*Ananya Poddar, Bharath Dandala, Murthy Devarakonda* - - Objective: Using natural language processing (NLP) to find sentences that -state treatment plans in a clinical note, would automate plan extraction and -would further enable their use in tools that help providers and care managers. -However, as in the most NLP tasks on clinical text, creating gold standard to -train and test NLP models is tedious and expensive. Fortuitously, sometimes but -not always clinical notes contain sections with a heading that identifies the -section as a plan. Leveraging contents of such labeled sections as a noisy -training data, we assessed accuracy of NLP models trained with the data. 
- Methods: We used common variations of plan headings and rule-based heuristics -to find plan sections with headings in clinical notes, and we extracted -sentences from them and formed a noisy training data of plan sentences. We -trained Support Vector Machine (SVM) and Convolutional Neural Network (CNN) -models with the data. We measured accuracy of the trained models on the noisy -dataset using ten-fold cross validation and separately on a set-aside manually -annotated dataset. - Results: About 13% of 117,730 clinical notes contained treatment plans -sections with recognizable headings in the 1001 longitudinal patient records -that were obtained from Cleveland Clinic under an IRB approval. We were able to -extract and create a noisy training data of 13,492 plan sentences from the -clinical notes. CNN achieved best F measures, 0.91 and 0.97 in the -cross-validation and set-aside evaluation experiments respectively. SVM -slightly underperformed with F measures of 0.89 and 0.96 in the same -experiments. - Conclusion: Our study showed that the training supervised learning models -using noisy plan sentences was effective in identifying them in all clinical -notes. More broadly, sections with informal headings in clinical notes can be a -good source for generating effective training data. - ---------------- - -### 22 Sep 2023 | [Large Language Models and Control Mechanisms Improve Text Readability of Biomedical Abstracts](https://arxiv.org/abs/2309.13202) | [⬇️](https://arxiv.org/pdf/2309.13202) -*Zihao Li, Samuel Belkadi, Nicolo Micheletti, Lifeng Han, Matthew Shardlow, Goran Nenadic* - - Biomedical literature often uses complex language and inaccessible -professional terminologies. That is why simplification plays an important role -in improving public health literacy. Applying Natural Language Processing (NLP) -models to automate such tasks allows for quick and direct accessibility for lay -readers. In this work, we investigate the ability of state-of-the-art large -language models (LLMs) on the task of biomedical abstract simplification, using -the publicly available dataset for plain language adaptation of biomedical -abstracts (\textbf{PLABA}). The methods applied include domain fine-tuning and -prompt-based learning (PBL) on: 1) Encoder-decoder models (T5, SciFive, and -BART), 2) Decoder-only GPT models (GPT-3.5 and GPT-4) from OpenAI and BioGPT, -and 3) Control-token mechanisms on BART-based models. We used a range of -automatic evaluation metrics, including BLEU, ROUGE, SARI, and BERTscore, and -also conducted human evaluations. BART-Large with Control Token (BART-L-w-CT) -mechanisms reported the highest SARI score of 46.54 and T5-base reported the -highest BERTscore 72.62. In human evaluation, BART-L-w-CTs achieved a better -simplicity score over T5-Base (2.9 vs. 2.2), while T5-Base achieved a better -meaning preservation score over BART-L-w-CTs (3.1 vs. 2.6). We also categorised -the system outputs with examples, hoping this will shed some light for future -research on this task. 
Our code, fine-tuned models, and data splits are -available at \url{https://github.com/HECTA-UoM/PLABA-MU} - ---------------- - -### 18 Jan 2022 | [Label Dependent Attention Model for Disease Risk Prediction Using Multimodal Electronic Health Records](https://arxiv.org/abs/2201.06779) | [⬇️](https://arxiv.org/pdf/2201.06779) -*Shuai Niu and Qing Yin and Yunya Song and Yike Guo and Xian Yang* - - Disease risk prediction has attracted increasing attention in the field of -modern healthcare, especially with the latest advances in artificial -intelligence (AI). Electronic health records (EHRs), which contain -heterogeneous patient information, are widely used in disease risk prediction -tasks. One challenge of applying AI models for risk prediction lies in -generating interpretable evidence to support the prediction results while -retaining the prediction ability. In order to address this problem, we propose -the method of jointly embedding words and labels whereby attention modules -learn the weights of words from medical notes according to their relevance to -the names of risk prediction labels. This approach boosts interpretability by -employing an attention mechanism and including the names of prediction tasks in -the model. However, its application is only limited to the handling of textual -inputs such as medical notes. In this paper, we propose a label dependent -attention model LDAM to 1) improve the interpretability by exploiting -Clinical-BERT (a biomedical language model pre-trained on a large clinical -corpus) to encode biomedically meaningful features and labels jointly; 2) -extend the idea of joint embedding to the processing of time-series data, and -develop a multi-modal learning framework for integrating heterogeneous -information from medical notes and time-series health status indicators. To -demonstrate our method, we apply LDAM to the MIMIC-III dataset to predict -different disease risks. We evaluate our method both quantitatively and -qualitatively. Specifically, the predictive power of LDAM will be shown, and -case studies will be carried out to illustrate its interpretability. - ---------------- - -### 23 May 2023 | [Detecting automatically the layout of clinical documents to enhance the performances of downstream natural language processing](https://arxiv.org/abs/2305.13817) | [⬇️](https://arxiv.org/pdf/2305.13817) -*Christel G\'erardin, Perceval Wajsb\"urt, Basile Dura, Alice Calliger, Alexandre Moucher, Xavier Tannier and Romain Bey* - - Objective:Develop and validate an algorithm for analyzing the layout of PDF -clinical documents to improve the performance of downstream natural language -processing tasks. Materials and Methods: We designed an algorithm to process -clinical PDF documents and extract only clinically relevant text. The algorithm -consists of several steps: initial text extraction using a PDF parser, followed -by classification into categories such as body text, left notes, and footers -using a Transformer deep neural network architecture, and finally an -aggregation step to compile the lines of a given label in the text. We -evaluated the technical performance of the body text extraction algorithm by -applying it to a random sample of documents that were annotated. Medical -performance was evaluated by examining the extraction of medical concepts of -interest from the text in their respective sections. Finally, we tested an -end-to-end system on a medical use case of automatic detection of acute -infection described in the hospital report. 
Results: Our algorithm achieved -per-line precision, recall, and F1 score of 98.4, 97.0, and 97.7, respectively, -for body line extraction. The precision, recall, and F1 score per document for -the acute infection detection algorithm were 82.54 (95CI 72.86-91.60), 85.24 -(95CI 76.61-93.70), 83.87 (95CI 76.92-90.08) with exploitation of the results -of the advanced body extraction algorithm, respectively. Conclusion: We have -developed and validated a system for extracting body text from clinical -documents in PDF format by identifying their layout. We were able to -demonstrate that this preprocessing allowed us to obtain better performances -for a common downstream task, i.e., the extraction of medical concepts in their -respective sections, thus proving the interest of this method on a clinical use -case. - ---------------- - -### 12 Jul 2021 | [A Systematic Literature Review of Automated ICD Coding and Classification Systems using Discharge Summaries](https://arxiv.org/abs/2107.10652) | [⬇️](https://arxiv.org/pdf/2107.10652) -*Rajvir Kaur, Jeewani Anupama Ginige and Oliver Obst* - - Codification of free-text clinical narratives has long been recognised to be -beneficial for secondary uses such as funding, insurance claim processing and -research. The current scenario of assigning codes is a manual process which is -very expensive, time-consuming and error prone. In recent years, many -researchers have studied the use of Natural Language Processing (NLP), related -Machine Learning (ML) and Deep Learning (DL) methods and techniques to resolve -the problem of manual coding of clinical narratives and to assist human coders -to assign clinical codes more accurately and efficiently. This systematic -literature review provides a comprehensive overview of automated clinical -coding systems that utilises appropriate NLP, ML and DL methods and techniques -to assign ICD codes to discharge summaries. We have followed the Preferred -Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) guidelines and -conducted a comprehensive search of publications from January 2010 to December -2020 in four academic databases: PubMed, ScienceDirect, Association for -Computing Machinery (ACM) Digital Library, and the Association for Computational -Linguistics (ACL) Anthology. We reviewed 7,556 publications; 38 met the -inclusion criteria. This review identified: datasets having discharge -summaries; NLP techniques along with some other data extraction processes, -different feature extraction and embedding techniques. To measure the -performance of classification methods, different evaluation metrics are used. -Lastly, future research directions are provided to scholars who are interested -in automated ICD code assignment. Efforts are still required to improve ICD -code prediction accuracy and the availability of large-scale de-identified clinical -corpora with the latest version of the classification system. This can be a -platform to guide and share knowledge with the less experienced coders and -researchers. - ---------------- - -### 25 Mar 2017 | [Comparing Rule-Based and Deep Learning Models for Patient Phenotyping](https://arxiv.org/abs/1703.08705) | [⬇️](https://arxiv.org/pdf/1703.08705) -*Sebastian Gehrmann, Franck Dernoncourt, Yeran Li, Eric T. Carlson, Joy T. Wu, Jonathan Welt, John Foote Jr., Edward T. Moseley, David W. Grant, Patrick D.
Tyler, Leo Anthony Celi* - - Objective: We investigate whether deep learning techniques for natural -language processing (NLP) can be used efficiently for patient phenotyping. -Patient phenotyping is a classification task for determining whether a patient -has a medical condition, and is a crucial part of secondary analysis of -healthcare data. We assess the performance of deep learning algorithms and -compare them with classical NLP approaches. - Materials and Methods: We compare convolutional neural networks (CNNs), -n-gram models, and approaches based on cTAKES that extract pre-defined medical -concepts from clinical notes and use them to predict patient phenotypes. The -performance is tested on 10 different phenotyping tasks using 1,610 discharge -summaries extracted from the MIMIC-III database. - Results: CNNs outperform other phenotyping algorithms in all 10 tasks. The -average F1-score of our model is 76 (PPV of 83, and sensitivity of 71) with our -model having an F1-score up to 37 points higher than alternative approaches. We -additionally assess the interpretability of our model by presenting a method -that extracts the most salient phrases for a particular prediction. - Conclusion: We show that NLP methods based on deep learning improve the -performance of patient phenotyping. Our CNN-based algorithm automatically -learns the phrases associated with each patient phenotype. As such, it reduces -the annotation complexity for clinical domain experts, who are normally -required to develop task-specific annotation rules and identify relevant -phrases. Our method performs well in terms of both performance and -interpretability, which indicates that deep learning is an effective approach -to patient phenotyping based on clinicians' notes. - ---------------- - - - - -# 5 - - - -1. 🔍 Transformer Models for Natural Language Processing (NLP) in Telehealth -2. 🤖 Generative Adversarial Networks (GANs) for Medical Image Analysis -3. 🧠 Deep Reinforcement Learning for Clinical Decision Support Systems - -## Keyword Glossary with Emojis - -- 💻 Machine Learning -- 🌐 Natural Language Processing -- 🖼️ Computer Vision -- 🧠 Deep Learning -- 🔬 Medical Imaging -- 💊 Drug Discovery -- 🩺 Clinical Decision Support - -```mermaid -graph LR - A[Machine Learning] --> B[Natural Language Processing] - A --> C[Computer Vision] - A --> D[Deep Learning] - D --> E[Medical Imaging] - D --> F[Drug Discovery] - D --> G[Clinical Decision Support] -``` - -```python -# app.py -import streamlit as st - -st.title("State-of-the-Art Techniques for AI in Telehealth and Healthcare") - -st.header("Outline with Emojis") -st.write("1. 🔍 Transformer Models for Natural Language Processing (NLP) in Telehealth") -st.write("2. 🤖 Generative Adversarial Networks (GANs) for Medical Image Analysis") -st.write("3. 
🧠 Deep Reinforcement Learning for Clinical Decision Support Systems") - -st.header("Keyword Glossary with Emojis") -st.write("- 💻 Machine Learning") -st.write("- 🌐 Natural Language Processing") -st.write("- 🖼️ Computer Vision") -st.write("- 🧠 Deep Learning") -st.write("- 🔬 Medical Imaging") -st.write("- 💊 Drug Discovery") -st.write("- 🩺 Clinical Decision Support") - -st.header("Mermaid Diagram") -mermaid_code = """ -graph LR - A[Machine Learning] --> B[Natural Language Processing] - A --> C[Computer Vision] - A --> D[Deep Learning] - D --> E[Medical Imaging] - D --> F[Drug Discovery] - D --> G[Clinical Decision Support] -""" -st.mermaid(mermaid_code) -``` - - -# 🩺🔍 Search Results -### 15 Sep 2022 | [Multi-Modal Masked Autoencoders for Medical Vision-and-Language Pre-Training](https://arxiv.org/abs/2209.07098) | [⬇️](https://arxiv.org/pdf/2209.07098) -*Zhihong Chen, Yuhao Du, Jinpeng Hu, Yang Liu, Guanbin Li, Xiang Wan, Tsung-Hui Chang* - - Medical vision-and-language pre-training provides a feasible solution to -extract effective vision-and-language representations from medical images and -texts. However, few studies have been dedicated to this field to facilitate -medical vision-and-language understanding. In this paper, we propose a -self-supervised learning paradigm with multi-modal masked autoencoders -(M$^3$AE), which learn cross-modal domain knowledge by reconstructing missing -pixels and tokens from randomly masked images and texts. There are three key -designs to make this simple approach work. First, considering the different -information densities of vision and language, we adopt different masking ratios -for the input image and text, where a considerably larger masking ratio is used -for images. Second, we use visual and textual features from different layers to -perform the reconstruction to deal with different levels of abstraction in -visual and language. Third, we develop different designs for vision and -language decoders (i.e., a Transformer for vision and a multi-layer perceptron -for language). To perform a comprehensive evaluation and facilitate further -research, we construct a medical vision-and-language benchmark including three -tasks. Experimental results demonstrate the effectiveness of our approach, -where state-of-the-art results are achieved on all downstream tasks. Besides, -we conduct further analysis to better verify the effectiveness of different -components of our approach and various settings of pre-training. The source -code is available at~\url{https://github.com/zhjohnchan/M3AE}. - ---------------- - -### 05 Jan 2021 | [Integration of Domain Knowledge using Medical Knowledge Graph Deep Learning for Cancer Phenotyping](https://arxiv.org/abs/2101.01337) | [⬇️](https://arxiv.org/pdf/2101.01337) -*Mohammed Alawad, Shang Gao, Mayanka Chandra Shekar, S.M.Shamimul Hasan, J. Blair Christian, Xiao-Cheng Wu, Eric B. Durbin, Jennifer Doherty, Antoinette Stroup, Linda Coyle, Lynne Penberthy, Georgia Tourassi* - - A key component of deep learning (DL) for natural language processing (NLP) -is word embeddings. Word embeddings that effectively capture the meaning and -context of the word that they represent can significantly improve the -performance of downstream DL models for various NLP tasks. Many existing word -embeddings techniques capture the context of words based on word co-occurrence -in documents and text; however, they often cannot capture broader -domain-specific relationships between concepts that may be crucial for the NLP -task at hand. 
In this paper, we propose a method to integrate external -knowledge from medical terminology ontologies into the context captured by word -embeddings. Specifically, we use a medical knowledge graph, such as the unified -medical language system (UMLS), to find connections between clinical terms in -cancer pathology reports. This approach aims to minimize the distance between -connected clinical concepts. We evaluate the proposed approach using a -Multitask Convolutional Neural Network (MT-CNN) to extract six cancer -characteristics -- site, subsite, laterality, behavior, histology, and grade -- -from a dataset of ~900K cancer pathology reports. The results show that the -MT-CNN model which uses our domain informed embeddings outperforms the same -MT-CNN using standard word2vec embeddings across all tasks, with an improvement -in the overall micro- and macro-F1 scores by 4.97\%and 22.5\%, respectively. - ---------------- - -### 11 Dec 2023 | [Generative Large Language Models Are All-purpose Text Analytics Engines: Text-to-text Learning Is All Your Need](https://arxiv.org/abs/2312.06099) | [⬇️](https://arxiv.org/pdf/2312.06099) -*Cheng Peng, Xi Yang, Aokun Chen, Zehao Yu, Kaleb E Smith, Anthony B Costa, Mona G Flores, Jiang Bian, Yonghui Wu* - - Objective To solve major clinical natural language processing (NLP) tasks -using a unified text-to-text learning architecture based on a generative large -language model (LLM) via prompt tuning. Methods We formulated 7 key clinical -NLP tasks as text-to-text learning and solved them using one unified generative -clinical LLM, GatorTronGPT, developed using GPT-3 architecture and trained with -up to 20 billion parameters. We adopted soft prompts (i.e., trainable vectors) -with frozen LLM, where the LLM parameters were not updated (i.e., frozen) and -only the vectors of soft prompts were updated, known as prompt tuning. We added -additional soft prompts as a prefix to the input layer, which were optimized -during the prompt tuning. We evaluated the proposed method using 7 clinical NLP -tasks and compared them with previous task-specific solutions based on -Transformer models. Results and Conclusion The proposed approach achieved -state-of-the-art performance for 5 out of 7 major clinical NLP tasks using one -unified generative LLM. Our approach outperformed previous task-specific -transformer models by ~3% for concept extraction and 7% for relation extraction -applied to social determinants of health, 3.4% for clinical concept -normalization, 3.4~10% for clinical abbreviation disambiguation, and 5.5~9% for -natural language inference. Our approach also outperformed a previously -developed prompt-based machine reading comprehension (MRC) model, -GatorTron-MRC, for clinical concept and relation extraction. The proposed -approach can deliver the ``one model for all`` promise from training to -deployment using a unified generative LLM. - ---------------- - -### 27 Jan 2023 | [A Multi-View Joint Learning Framework for Embedding Clinical Codes and Text Using Graph Neural Networks](https://arxiv.org/abs/2301.11608) | [⬇️](https://arxiv.org/pdf/2301.11608) -*Lecheng Kong, Christopher King, Bradley Fritz, Yixin Chen* - - Learning to represent free text is a core task in many clinical machine -learning (ML) applications, as clinical text contains observations and plans -not otherwise available for inference. 
State-of-the-art methods use large -language models developed with immense computational resources and training -data; however, applying these models is challenging because of the highly -varying syntax and vocabulary in clinical free text. Structured information -such as International Classification of Disease (ICD) codes often succinctly -abstracts the most important facts of a clinical encounter and yields good -performance, but is often not as available as clinical text in real-world -scenarios. We propose a \textbf{multi-view learning framework} that jointly -learns from codes and text to combine the availability and forward-looking -nature of text and better performance of ICD codes. The learned text embeddings -can be used as inputs to predictive algorithms independent of the ICD codes -during inference. Our approach uses a Graph Neural Network (GNN) to process ICD -codes, and Bi-LSTM to process text. We apply Deep Canonical Correlation -Analysis (DCCA) to enforce the two views to learn a similar representation of -each patient. In experiments using planned surgical procedure text, our model -outperforms BERT models fine-tuned to clinical data, and in experiments using -diverse text in MIMIC-III, our model is competitive to a fine-tuned BERT at a -tiny fraction of its computational effort. - ---------------- - -### 17 Feb 2023 | [Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts](https://arxiv.org/abs/2302.08958) | [⬇️](https://arxiv.org/pdf/2302.08958) -*Zhihong Chen, Shizhe Diao, Benyou Wang, Guanbin Li, Xiang Wan* - - Medical vision-and-language pre-training (Med-VLP) has shown promising -improvements on many downstream medical tasks owing to its applicability to -extracting generic representations from medical images and texts. Practically, -there exist two typical types, \textit{i.e.}, the fusion-encoder type and the -dual-encoder type, depending on whether a heavy fusion module is used. The -former is superior at multi-modal tasks owing to the sufficient interaction -between modalities; the latter is good at uni-modal and cross-modal tasks due -to the single-modality encoding ability. To take advantage of these two types, -we propose an effective yet straightforward scheme named PTUnifier to unify the -two types. We first unify the input format by introducing visual and textual -prompts, which serve as a feature bank that stores the most representative -images/texts. By doing so, a single model could serve as a \textit{foundation -model} that processes various tasks adopting different input formats -(\textit{i.e.}, image-only, text-only, and image-text-pair). Furthermore, we -construct a prompt pool (instead of static ones) to improve diversity and -scalability. Experimental results show that our approach achieves -state-of-the-art results on a broad range of tasks, spanning uni-modal tasks -(\textit{i.e.}, image/text classification and text summarization), cross-modal -tasks (\textit{i.e.}, image-to-text generation and image-text/text-image -retrieval), and multi-modal tasks (\textit{i.e.}, visual question answering), -demonstrating the effectiveness of our approach. Note that the adoption of -prompts is orthogonal to most existing Med-VLP approaches and could be a -beneficial and complementary extension to these approaches. 
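Both the prompt tuning used for GatorTronGPT and PTUnifier's prompt pool rest on the same mechanic: a small set of trainable vectors is prepended to the inputs of an otherwise frozen model. Below is a minimal PyTorch sketch of that mechanic, with illustrative shapes and names rather than either paper's actual code.

```python
import torch
import torch.nn as nn

class PromptPrepender(nn.Module):
    """Prepend a pool of learnable soft prompts to a sequence of token embeddings."""

    def __init__(self, num_prompts: int = 8, dim: int = 768):
        super().__init__()
        # Learnable prompt vectors, shared across the batch; the backbone stays frozen.
        self.prompts = nn.Parameter(torch.randn(num_prompts, dim) * 0.02)

    def forward(self, token_embeddings: torch.Tensor) -> torch.Tensor:
        # token_embeddings: (batch, seq_len, dim) from a text or image encoder.
        batch = token_embeddings.size(0)
        prompts = self.prompts.unsqueeze(0).expand(batch, -1, -1)
        return torch.cat([prompts, token_embeddings], dim=1)

x = torch.randn(2, 16, 768)          # stand-in for encoder input embeddings
print(PromptPrepender()(x).shape)    # torch.Size([2, 24, 768])
```

During prompt tuning only `self.prompts` (plus any task head) would receive gradient updates, which is what keeps the approach parameter-efficient.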
- ---------------- - -### 18 Apr 2019 | [Learning to Collocate Neural Modules for Image Captioning](https://arxiv.org/abs/1904.08608) | [⬇️](https://arxiv.org/pdf/1904.08608) -*Xu Yang, Hanwang Zhang, Jianfei Cai* - - We do not speak word by word from scratch; our brain quickly structures a -pattern like \textsc{sth do sth at someplace} and then fill in the detailed -descriptions. To render existing encoder-decoder image captioners such -human-like reasoning, we propose a novel framework: learning to Collocate -Neural Modules (CNM), to generate the `inner pattern' connecting visual encoder -and language decoder. Unlike the widely-used neural module networks in visual -Q\&A, where the language (ie, question) is fully observable, CNM for captioning -is more challenging as the language is being generated and thus is partially -observable. To this end, we make the following technical contributions for CNM -training: 1) compact module design --- one for function words and three for -visual content words (eg, noun, adjective, and verb), 2) soft module fusion and -multi-step module execution, robustifying the visual reasoning in partial -observation, 3) a linguistic loss for module controller being faithful to -part-of-speech collocations (eg, adjective is before noun). Extensive -experiments on the challenging MS-COCO image captioning benchmark validate the -effectiveness of our CNM image captioner. In particular, CNM achieves a new -state-of-the-art 127.9 CIDEr-D on Karpathy split and a single-model 126.0 c40 -on the official server. CNM is also robust to few training samples, eg, by -training only one sentence per image, CNM can halve the performance loss -compared to a strong baseline. - ---------------- - -### 22 Sep 2023 | [Large Language Models and Control Mechanisms Improve Text Readability of Biomedical Abstracts](https://arxiv.org/abs/2309.13202) | [⬇️](https://arxiv.org/pdf/2309.13202) -*Zihao Li, Samuel Belkadi, Nicolo Micheletti, Lifeng Han, Matthew Shardlow, Goran Nenadic* - - Biomedical literature often uses complex language and inaccessible -professional terminologies. That is why simplification plays an important role -in improving public health literacy. Applying Natural Language Processing (NLP) -models to automate such tasks allows for quick and direct accessibility for lay -readers. In this work, we investigate the ability of state-of-the-art large -language models (LLMs) on the task of biomedical abstract simplification, using -the publicly available dataset for plain language adaptation of biomedical -abstracts (\textbf{PLABA}). The methods applied include domain fine-tuning and -prompt-based learning (PBL) on: 1) Encoder-decoder models (T5, SciFive, and -BART), 2) Decoder-only GPT models (GPT-3.5 and GPT-4) from OpenAI and BioGPT, -and 3) Control-token mechanisms on BART-based models. We used a range of -automatic evaluation metrics, including BLEU, ROUGE, SARI, and BERTscore, and -also conducted human evaluations. BART-Large with Control Token (BART-L-w-CT) -mechanisms reported the highest SARI score of 46.54 and T5-base reported the -highest BERTscore 72.62. In human evaluation, BART-L-w-CTs achieved a better -simplicity score over T5-Base (2.9 vs. 2.2), while T5-Base achieved a better -meaning preservation score over BART-L-w-CTs (3.1 vs. 2.6). We also categorised -the system outputs with examples, hoping this will shed some light for future -research on this task. 
Our code, fine-tuned models, and data splits are -available at \url{https://github.com/HECTA-UoM/PLABA-MU} - ---------------- - -### 05 Nov 2019 | [Integrating Dictionary Feature into A Deep Learning Model for Disease Named Entity Recognition](https://arxiv.org/abs/1911.01600) | [⬇️](https://arxiv.org/pdf/1911.01600) -*Hamada A. Nayel and Shashrekha H. L* - - In recent years, Deep Learning (DL) models are becoming important due to -their demonstrated success at overcoming complex learning problems. DL models -have been applied effectively for different Natural Language Processing (NLP) -tasks such as part-of-Speech (PoS) tagging and Machine Translation (MT). -Disease Named Entity Recognition (Disease-NER) is a crucial task which aims at -extracting disease Named Entities (NEs) from text. In this paper, a DL model -for Disease-NER using dictionary information is proposed and evaluated on -National Center for Biotechnology Information (NCBI) disease corpus and BC5CDR -dataset. Word embeddings trained over general domain texts as well as -biomedical texts have been used to represent input to the proposed model. This -study also compares two different Segment Representation (SR) schemes, namely -IOB2 and IOBES for Disease-NER. The results illustrate that using dictionary -information, pre-trained word embeddings, character embeddings and CRF with -global score improves the performance of Disease-NER system. - ---------------- - -### 24 Apr 2023 | [Learning to Collocate Visual-Linguistic Neural Modules for Image Captioning](https://arxiv.org/abs/2210.01338) | [⬇️](https://arxiv.org/pdf/2210.01338) -*Xu Yang and Hanwang Zhang and Chongyang Gao and Jianfei Cai* - - Humans tend to decompose a sentence into different parts like \textsc{sth do -sth at someplace} and then fill each part with certain content. Inspired by -this, we follow the \textit{principle of modular design} to propose a novel -image captioner: learning to Collocate Visual-Linguistic Neural Modules -(CVLNM). Unlike the \re{widely used} neural module networks in VQA, where the -language (\ie, question) is fully observable, \re{the task of collocating -visual-linguistic modules is more challenging.} This is because the language is -only partially observable, for which we need to dynamically collocate the -modules during the process of image captioning. To sum up, we make the -following technical contributions to design and train our CVLNM: 1) -\textit{distinguishable module design} -- \re{four modules in the encoder} -including one linguistic module for function words and three visual modules for -different content words (\ie, noun, adjective, and verb) and another linguistic -one in the decoder for commonsense reasoning, 2) a self-attention based -\textit{module controller} for robustifying the visual reasoning, 3) a -part-of-speech based \textit{syntax loss} imposed on the module controller for -further regularizing the training of our CVLNM. Extensive experiments on the -MS-COCO dataset show that our CVLNM is more effective, \eg, achieving a new -state-of-the-art 129.5 CIDEr-D, and more robust, \eg, being less likely to -overfit to dataset bias and suffering less when fewer training samples are -available. 
Codes are available at \url{https://github.com/GCYZSL/CVLMN} - ---------------- - -### 10 Feb 2014 | [\'Etude cognitive des processus de construction d'une requ\^ete dans un syst\`eme de gestion de connaissances m\'edicales](https://arxiv.org/abs/1402.2562) | [⬇️](https://arxiv.org/pdf/1402.2562) -*Nathalie Chaignaud (LITIS), Val\'erie Delavigne (LiDiFra), Maryvonne Holzem (LiDiFra), Jean-Philippe Kotowicz (LITIS), Alain Loisel (LITIS)* - - This article presents the Cogni-CISMeF project, which aims at improving -medical information search in the CISMeF system (Catalog and Index of -French-language health resources) by including a conversational agent to -interact with the user in natural language. To study the cognitive processes -involved during the information search, a bottom-up methodology was adopted. -Experimentation has been set up to obtain human dialogs between a user (playing -the role of patient) dealing with medical information search and a CISMeF -expert refining the request. The analysis of these dialogs underlined the use -of discursive evidence: vocabulary, reformulation, implicit or explicit -expression of user intentions, conversational sequences, etc. A model of -artificial agent is proposed. It leads the user in its information search by -proposing to him examples, assistance and choices. This model was implemented -and integrated in the CISMeF system. ---- Cet article d\'ecrit le projet -Cogni-CISMeF qui propose un module de dialogue Homme-Machine \`a int\'egrer -dans le syst\`eme d'indexation de connaissances m\'edicales CISMeF (Catalogue -et Index des Sites M\'edicaux Francophones). Nous avons adopt\'e une d\'emarche -de mod\'elisation cognitive en proc\'edant \`a un recueil de corpus de -dialogues entre un utilisateur (jouant le r\^ole d'un patient) d\'esirant une -information m\'edicale et un expert CISMeF af inant cette demande pour -construire la requ\^ete. Nous avons analys\'e la structure des dialogues ainsi -obtenus et avons \'etudi\'e un certain nombre d'indices discursifs : -vocabulaire employ\'e, marques de reformulation, commentaires m\'eta et -\'epilinguistiques, expression implicite ou explicite des intentions de -l'utilisateur, encha\^inement conversationnel, etc. De cette analyse, nous -avons construit un mod\`ele d'agent artificiel dot\'e de capacit\'es cognitives -capables d'aider l'utilisateur dans sa t\^ache de recherche d'information. Ce -mod\`ele a \'et\'e impl\'ement\'e et int\'egr\'e dans le syst\`eme CISMeF. - ---------------- - -### 14 Sep 2022 | [Summarizing Patients Problems from Hospital Progress Notes Using Pre-trained Sequence-to-Sequence Models](https://arxiv.org/abs/2208.08408) | [⬇️](https://arxiv.org/pdf/2208.08408) -*Yanjun Gao, Dmitriy Dligach, Timothy Miller, Dongfang Xu, Matthew M. Churpek, Majid Afshar* - - Automatically summarizing patients' main problems from daily progress notes -using natural language processing methods helps to battle against information -and cognitive overload in hospital settings and potentially assists providers -with computerized diagnostic decision support. Problem list summarization -requires a model to understand, abstract, and generate clinical documentation. -In this work, we propose a new NLP task that aims to generate a list of -problems in a patient's daily care plan using input from the provider's -progress notes during hospitalization. We investigate the performance of T5 and -BART, two state-of-the-art seq2seq transformer architectures, in solving this -problem. 
We provide a corpus built on top of progress notes from publicly -available electronic health record progress notes in the Medical Information -Mart for Intensive Care (MIMIC)-III. T5 and BART are trained on general domain -text, and we experiment with a data augmentation method and a domain adaptation -pre-training method to increase exposure to medical vocabulary and knowledge. -Evaluation methods include ROUGE, BERTScore, cosine similarity on sentence -embedding, and F-score on medical concepts. Results show that T5 with domain -adaptive pre-training achieves significant performance gains compared to a -rule-based system and general domain pre-trained language models, indicating a -promising direction for tackling the problem summarization task. - ---------------- - -### 04 Jan 2024 | [Text2MDT: Extracting Medical Decision Trees from Medical Texts](https://arxiv.org/abs/2401.02034) | [⬇️](https://arxiv.org/pdf/2401.02034) -*Wei Zhu and Wenfeng Li and Xing Tian and Pengfei Wang and Xiaoling Wang and Jin Chen and Yuanbin Wu and Yuan Ni and Guotong Xie* - - Knowledge of the medical decision process, which can be modeled as medical -decision trees (MDTs), is critical to build clinical decision support systems. -However, the current MDT construction methods rely heavily on time-consuming -and laborious manual annotation. In this work, we propose a novel task, -Text2MDT, to explore the automatic extraction of MDTs from medical texts such -as medical guidelines and textbooks. We normalize the form of the MDT and -create an annotated Text-to-MDT dataset in Chinese with the participation of -medical experts. We investigate two different methods for the Text2MDT tasks: -(a) an end-to-end framework which only relies on a GPT style large language -models (LLM) instruction tuning to generate all the node information and tree -structures. (b) The pipeline framework which decomposes the Text2MDT task to -three subtasks. Experiments on our Text2MDT dataset demonstrate that: (a) the -end-to-end method basd on LLMs (7B parameters or larger) show promising -results, and successfully outperform the pipeline methods. (b) The -chain-of-thought (COT) prompting method \cite{Wei2022ChainOT} can improve the -performance of the fine-tuned LLMs on the Text2MDT test set. (c) the -lightweight pipelined method based on encoder-based pretrained models can -perform comparably with LLMs with model complexity two magnititudes smaller. -Our Text2MDT dataset is open-sourced at -\url{https://tianchi.aliyun.com/dataset/95414}, and the source codes are -open-sourced at \url{https://github.com/michael-wzhu/text2dt}. - ---------------- - -### 03 Jun 2021 | [UmlsBERT: Clinical Domain Knowledge Augmentation of Contextual Embeddings Using the Unified Medical Language System Metathesaurus](https://arxiv.org/abs/2010.10391) | [⬇️](https://arxiv.org/pdf/2010.10391) -*George Michalopoulos, Yuanxin Wang, Hussam Kaka, Helen Chen and Alexander Wong* - - Contextual word embedding models, such as BioBERT and Bio_ClinicalBERT, have -achieved state-of-the-art results in biomedical natural language processing -tasks by focusing their pre-training process on domain-specific corpora. -However, such models do not take into consideration expert domain knowledge. - In this work, we introduced UmlsBERT, a contextual embedding model that -integrates domain knowledge during the pre-training process via a novel -knowledge augmentation strategy. 
More specifically, the augmentation on -UmlsBERT with the Unified Medical Language System (UMLS) Metathesaurus was -performed in two ways: i) connecting words that have the same underlying -`concept' in UMLS, and ii) leveraging semantic group knowledge in UMLS to -create clinically meaningful input embeddings. By applying these two -strategies, UmlsBERT can encode clinical domain knowledge into word embeddings -and outperform existing domain-specific models on common named-entity -recognition (NER) and clinical natural language inference clinical NLP tasks. - ---------------- - -### 15 Sep 2022 | [Align, Reason and Learn: Enhancing Medical Vision-and-Language Pre-training with Knowledge](https://arxiv.org/abs/2209.07118) | [⬇️](https://arxiv.org/pdf/2209.07118) -*Zhihong Chen, Guanbin Li, Xiang Wan* - - Medical vision-and-language pre-training (Med-VLP) has received considerable -attention owing to its applicability to extracting generic vision-and-language -representations from medical images and texts. Most existing methods mainly -contain three elements: uni-modal encoders (i.e., a vision encoder and a -language encoder), a multi-modal fusion module, and pretext tasks, with few -studies considering the importance of medical domain expert knowledge and -explicitly exploiting such knowledge to facilitate Med-VLP. Although there -exist knowledge-enhanced vision-and-language pre-training (VLP) methods in the -general domain, most require off-the-shelf toolkits (e.g., object detectors and -scene graph parsers), which are unavailable in the medical domain. In this -paper, we propose a systematic and effective approach to enhance Med-VLP by -structured medical knowledge from three perspectives. First, considering -knowledge can be regarded as the intermediate medium between vision and -language, we align the representations of the vision encoder and the language -encoder through knowledge. Second, we inject knowledge into the multi-modal -fusion model to enable the model to perform reasoning using knowledge as the -supplementation of the input image and text. Third, we guide the model to put -emphasis on the most critical information in images and texts by designing -knowledge-induced pretext tasks. To perform a comprehensive evaluation and -facilitate further research, we construct a medical vision-and-language -benchmark including three tasks. Experimental results illustrate the -effectiveness of our approach, where state-of-the-art performance is achieved -on all downstream tasks. Further analyses explore the effects of different -components of our approach and various settings of pre-training. - ---------------- - -### 21 Oct 2019 | [Two Case Studies of Experience Prototyping Machine Learning Systems in the Wild](https://arxiv.org/abs/1910.09137) | [⬇️](https://arxiv.org/pdf/1910.09137) -*Qian Yang* - - Throughout the course of my Ph.D., I have been designing the user experience -(UX) of various machine learning (ML) systems. In this workshop, I share two -projects as case studies in which people engage with ML in much more -complicated and nuanced ways than the technical HCML work might assume. The -first case study describes how cardiology teams in three hospitals used a -clinical decision-support system that helps them decide whether and when to -implant an artificial heart to a heart failure patient. I demonstrate that -physicians cannot draw on their decision-making experience by seeing only -patient data on paper. 
They are also confused by some fundamental premises upon -which ML operates. For example, physicians asked: Are ML predictions made based -on clinicians' best efforts? Is it ethical to make decisions based on previous -patients' collective outcomes? In the second case study, my collaborators and I -designed an intelligent text editor, with the goal of improving authors' -writing experience with NLP (Natural Language Processing) technologies. We -prototyped a number of generative functionalities where the system provides -phrase-or-sentence-level writing suggestions upon user request. When writing -with the prototype, however, authors shared that they need to "see where the -sentence is going two paragraphs later" in order to decide whether the -suggestion aligns with their writing; Some even considered adopting machine -suggestions as plagiarism, therefore "is simply wrong". - By sharing these unexpected and intriguing responses from these real-world ML -users, I hope to start a discussion about such previously-unknown complexities -and nuances of -- as the workshop proposal states -- "putting ML at the service -of people in a way that is accessible, useful, and trustworthy to all". - ---------------- - -### 22 Aug 2020 | [Symbolic Semantic Segmentation and Interpretation of COVID-19 Lung Infections in Chest CT volumes based on Emergent Languages](https://arxiv.org/abs/2008.09866) | [⬇️](https://arxiv.org/pdf/2008.09866) -*Aritra Chowdhury, Alberto Santamaria-Pang, James R. Kubricht, Jianwei Qiu, Peter Tu* - - The coronavirus disease (COVID-19) has resulted in a pandemic crippling the a -breadth of services critical to daily life. Segmentation of lung infections in -computerized tomography (CT) slices could be be used to improve diagnosis and -understanding of COVID-19 in patients. Deep learning systems lack -interpretability because of their black box nature. Inspired by human -communication of complex ideas through language, we propose a symbolic -framework based on emergent languages for the segmentation of COVID-19 -infections in CT scans of lungs. We model the cooperation between two -artificial agents - a Sender and a Receiver. These agents synergistically -cooperate using emergent symbolic language to solve the task of semantic -segmentation. Our game theoretic approach is to model the cooperation between -agents unlike Generative Adversarial Networks (GANs). The Sender retrieves -information from one of the higher layers of the deep network and generates a -symbolic sentence sampled from a categorical distribution of vocabularies. The -Receiver ingests the stream of symbols and cogenerates the segmentation mask. A -private emergent language is developed that forms the communication channel -used to describe the task of segmentation of COVID infections. We augment -existing state of the art semantic segmentation architectures with our symbolic -generator to form symbolic segmentation models. Our symbolic segmentation -framework achieves state of the art performance for segmentation of lung -infections caused by COVID-19. Our results show direct interpretation of -symbolic sentences to discriminate between normal and infected regions, -infection morphology and image characteristics. We show state of the art -results for segmentation of COVID-19 lung infections in CT. 
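The Sender/Receiver setup in the emergent-language segmentation paper above reduces to a simple pattern: the Sender turns deep features into a short sequence of discrete symbols sampled from a categorical distribution, and a Receiver consumes those symbols to co-generate the mask. Here is a minimal sketch of the Sender side, with an illustrative vocabulary size, sentence length, and feature width (not the paper's implementation).

```python
import torch
import torch.nn as nn

VOCAB, SENT_LEN, FEAT = 32, 6, 256   # illustrative sizes

class Sender(nn.Module):
    def __init__(self):
        super().__init__()
        self.to_logits = nn.Linear(FEAT, SENT_LEN * VOCAB)

    def forward(self, features: torch.Tensor):
        # features: (batch, FEAT), e.g. pooled activations from a segmentation encoder.
        logits = self.to_logits(features).view(-1, SENT_LEN, VOCAB)
        dist = torch.distributions.Categorical(logits=logits)
        symbols = dist.sample()                   # (batch, SENT_LEN) discrete "words"
        return symbols, dist.log_prob(symbols)    # log-probs for score-function training

symbols, log_probs = Sender()(torch.randn(4, FEAT))
print(symbols.shape, log_probs.shape)             # torch.Size([4, 6]) for both
```

Because the sampled channel is discrete, such models are usually trained with score-function (REINFORCE-style) gradients or a Gumbel-softmax relaxation; the returned log-probabilities support the former.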
- ---------------- - -### 25 Jan 2021 | [PhenoTagger: A Hybrid Method for Phenotype Concept Recognition using Human Phenotype Ontology](https://arxiv.org/abs/2009.08478) | [⬇️](https://arxiv.org/pdf/2009.08478) -*Ling Luo, Shankai Yan, Po-Ting Lai, Daniel Veltri, Andrew Oler, Sandhya Xirasagar, Rajarshi Ghosh, Morgan Similuk, Peter N. Robinson, Zhiyong Lu* - - Automatic phenotype concept recognition from unstructured text remains a -challenging task in biomedical text mining research. Previous works that -address the task typically use dictionary-based matching methods, which can -achieve high precision but suffer from lower recall. Recently, machine -learning-based methods have been proposed to identify biomedical concepts, -which can recognize more unseen concept synonyms by automatic feature learning. -However, most methods require large corpora of manually annotated data for -model training, which is difficult to obtain due to the high cost of human -annotation. In this paper, we propose PhenoTagger, a hybrid method that -combines both dictionary and machine learning-based methods to recognize Human -Phenotype Ontology (HPO) concepts in unstructured biomedical text. We first use -all concepts and synonyms in HPO to construct a dictionary, which is then used -to automatically build a distantly supervised training dataset for machine -learning. Next, a cutting-edge deep learning model is trained to classify each -candidate phrase (n-gram from input sentence) into a corresponding concept -label. Finally, the dictionary and machine learning-based prediction results -are combined for improved performance. Our method is validated with two HPO -corpora, and the results show that PhenoTagger compares favorably to previous -methods. In addition, to demonstrate the generalizability of our method, we -retrained PhenoTagger using the disease ontology MEDIC for disease concept -recognition to investigate the effect of training on different ontologies. -Experimental results on the NCBI disease corpus show that PhenoTagger without -requiring manually annotated training data achieves competitive performance as -compared with state-of-the-art supervised methods. - ---------------- - -### 06 Oct 2016 | [A New Data Representation Based on Training Data Characteristics to Extract Drug Named-Entity in Medical Text](https://arxiv.org/abs/1610.01891) | [⬇️](https://arxiv.org/pdf/1610.01891) -*Sadikin Mujiono, Mohamad Ivan Fanany, Chan Basaruddin* - - One essential task in information extraction from the medical corpus is drug -name recognition. Compared with text sources come from other domains, the -medical text is special and has unique characteristics. In addition, the -medical text mining poses more challenges, e.g., more unstructured text, the -fast growing of new terms addition, a wide range of name variation for the same -drug. The mining is even more challenging due to the lack of labeled dataset -sources and external knowledge, as well as multiple token representations for a -single drug name that is more common in the real application setting. Although -many approaches have been proposed to overwhelm the task, some problems -remained with poor F-score performance (less than 0.75). This paper presents a -new treatment in data representation techniques to overcome some of those -challenges. We propose three data representation techniques based on the -characteristics of word distribution and word similarities as a result of word -embedding training. 
The first technique is evaluated with the standard NN -model, i.e., MLP (Multi-Layer Perceptrons). The second technique involves two -deep network classifiers, i.e., DBN (Deep Belief Networks), and SAE (Stacked -Denoising Encoders). The third technique represents the sentence as a sequence -that is evaluated with a recurrent NN model, i.e., LSTM (Long Short Term -Memory). In extracting the drug name entities, the third technique gives the -best F-score performance compared to the state of the art, with its average -F-score being 0.8645. - ---------------- - -### 21 Oct 2022 | [Doctors Handwritten Prescription Recognition System In Multi Language Using Deep Learning](https://arxiv.org/abs/2210.11666) | [⬇️](https://arxiv.org/pdf/2210.11666) -*Pavithiran G, Sharan Padmanabhan, Nuvvuru Divya, Aswathy V, Irene Jerusha P, Chandar B* - - Doctors typically write in incomprehensible handwriting, making it difficult -for both the general public and some pharmacists to understand the medications -they have prescribed. It is not ideal for them to write the prescription -quietly and methodically because they will be dealing with dozens of patients -every day and will be swamped with work.As a result, their handwriting is -illegible. This may result in reports or prescriptions consisting of short -forms and cursive writing that a typical person or pharmacist won't be able to -read properly, which will cause prescribed medications to be misspelled. -However, some individuals are accustomed to writing prescriptions in regional -languages because we all live in an area with a diversity of regional -languages. It makes analyzing the content much more challenging. So, in this -project, we'll use a recognition system to build a tool that can translate the -handwriting of physicians in any language. This system will be made into an -application which is fully autonomous in functioning. As the user uploads the -prescription image the program will pre-process the image by performing image -pre-processing, and word segmentations initially before processing the image -for training. And it will be done for every language we require the model to -detect. And as of the deduction model will be made using deep learning -techniques including CNN, RNN, and LSTM, which are utilized to train the model. -To match words from various languages that will be written in the system, -Unicode will be used. Furthermore, fuzzy search and market basket analysis are -employed to offer an end result that will be optimized from the pharmaceutical -database and displayed to the user as a structured output. - ---------------- - -### 16 Sep 2018 | [Using the Tsetlin Machine to Learn Human-Interpretable Rules for High-Accuracy Text Categorization with Medical Applications](https://arxiv.org/abs/1809.04547) | [⬇️](https://arxiv.org/pdf/1809.04547) -*Geir Thore Berge, Ole-Christoffer Granmo, Tor Oddbj{\o}rn Tveit, Morten Goodwin, Lei Jiao, Bernt Viggo Matheussen* - - Medical applications challenge today's text categorization techniques by -demanding both high accuracy and ease-of-interpretation. Although deep learning -has provided a leap ahead in accuracy, this leap comes at the sacrifice of -interpretability. To address this accuracy-interpretability challenge, we here -introduce, for the first time, a text categorization approach that leverages -the recently introduced Tsetlin Machine. In all brevity, we represent the terms -of a text as propositional variables. 
From these, we capture categories using -simple propositional formulae, such as: if "rash" and "reaction" and -"penicillin" then Allergy. The Tsetlin Machine learns these formulae from a -labelled text, utilizing conjunctive clauses to represent the particular facets -of each category. Indeed, even the absence of terms (negated features) can be -used for categorization purposes. Our empirical comparison with Na\"ive Bayes, -decision trees, linear support vector machines (SVMs), random forest, long -short-term memory (LSTM) neural networks, and other techniques, is quite -conclusive. The Tsetlin Machine either performs on par with or outperforms all -of the evaluated methods on both the 20 Newsgroups and IMDb datasets, as well -as on a non-public clinical dataset. On average, the Tsetlin Machine delivers -the best recall and precision scores across the datasets. Finally, our GPU -implementation of the Tsetlin Machine executes 5 to 15 times faster than the -CPU implementation, depending on the dataset. We thus believe that our novel -approach can have a significant impact on a wide range of text analysis -applications, forming a promising starting point for deeper natural language -understanding with the Tsetlin Machine. - ---------------- - - - - -# 6 - - - - -1. 🌐 Imitation Learning for Behavior Cloning - - Leveraging deep learning models to learn and mimic human behavior from demonstrations. - - Application: Telemed bot mimicking doctor's gestures and communication style. - -2. 🤖 Generative Adversarial Networks (GANs) for Mirroring - - Using GANs to generate mirrored behavior by learning the distribution of human actions. - - Example: AI assistant generating synchronized facial expressions and body language. - -3. 💡 Empathic AI for Shared World Modeling - - Developing AI systems that can understand and model human emotions, beliefs, and intentions. - - Application: AI assistant building rapport by mirroring and adapting to the user's emotional state. - -Keywords: - -1. 🔑 Imitation Learning -2. 🔑 Behavior Cloning -3. 🔑 Generative Adversarial Networks (GANs) -4. 🔑 Empathic AI -5. 🔑 Shared World Modeling -6. 🔑 Mirroring - -Mermaid Model: - -```mermaid -graph LR - A[Imitation Learning] --> B[Behavior Cloning] - B --> C[Mirroring] - D[Generative Adversarial Networks] --> C - E[Empathic AI] --> F[Shared World Modeling] - F --> C -``` - - -# 🩺🔍 Search Results -### 22 Aug 2020 | [Symbolic Semantic Segmentation and Interpretation of COVID-19 Lung Infections in Chest CT volumes based on Emergent Languages](https://arxiv.org/abs/2008.09866) | [⬇️](https://arxiv.org/pdf/2008.09866) -*Aritra Chowdhury, Alberto Santamaria-Pang, James R. Kubricht, Jianwei Qiu, Peter Tu* - - The coronavirus disease (COVID-19) has resulted in a pandemic crippling the a -breadth of services critical to daily life. Segmentation of lung infections in -computerized tomography (CT) slices could be be used to improve diagnosis and -understanding of COVID-19 in patients. Deep learning systems lack -interpretability because of their black box nature. Inspired by human -communication of complex ideas through language, we propose a symbolic -framework based on emergent languages for the segmentation of COVID-19 -infections in CT scans of lungs. We model the cooperation between two -artificial agents - a Sender and a Receiver. These agents synergistically -cooperate using emergent symbolic language to solve the task of semantic -segmentation. 
Our game theoretic approach is to model the cooperation between -agents unlike Generative Adversarial Networks (GANs). The Sender retrieves -information from one of the higher layers of the deep network and generates a -symbolic sentence sampled from a categorical distribution of vocabularies. The -Receiver ingests the stream of symbols and cogenerates the segmentation mask. A -private emergent language is developed that forms the communication channel -used to describe the task of segmentation of COVID infections. We augment -existing state of the art semantic segmentation architectures with our symbolic -generator to form symbolic segmentation models. Our symbolic segmentation -framework achieves state of the art performance for segmentation of lung -infections caused by COVID-19. Our results show direct interpretation of -symbolic sentences to discriminate between normal and infected regions, -infection morphology and image characteristics. We show state of the art -results for segmentation of COVID-19 lung infections in CT. - ---------------- - -### 04 Aug 2020 | [Towards Emergent Language Symbolic Semantic Segmentation and Model Interpretability](https://arxiv.org/abs/2007.09448) | [⬇️](https://arxiv.org/pdf/2007.09448) -*Alberto Santamaria-Pang, James Kubricht, Aritra Chowdhury, Chitresh Bhushan, Peter Tu* - - Recent advances in methods focused on the grounding problem have resulted in -techniques that can be used to construct a symbolic language associated with a -specific domain. Inspired by how humans communicate complex ideas through -language, we developed a generalized Symbolic Semantic ($\text{S}^2$) framework -for interpretable segmentation. Unlike adversarial models (e.g., GANs), we -explicitly model cooperation between two agents, a Sender and a Receiver, that -must cooperate to achieve a common goal. The Sender receives information from a -high layer of a segmentation network and generates a symbolic sentence derived -from a categorical distribution. The Receiver obtains the symbolic sentences -and co-generates the segmentation mask. In order for the model to converge, the -Sender and Receiver must learn to communicate using a private language. We -apply our architecture to segment tumors in the TCGA dataset. A UNet-like -architecture is used to generate input to the Sender network which produces a -symbolic sentence, and a Receiver network co-generates the segmentation mask -based on the sentence. Our Segmentation framework achieved similar or better -performance compared with state-of-the-art segmentation methods. In addition, -our results suggest direct interpretation of the symbolic sentences to -discriminate between normal and tumor tissue, tumor morphology, and other image -characteristics. - ---------------- - -### 23 Aug 2023 | [CgT-GAN: CLIP-guided Text GAN for Image Captioning](https://arxiv.org/abs/2308.12045) | [⬇️](https://arxiv.org/pdf/2308.12045) -*Jiarui Yu, Haoran Li, Yanbin Hao, Bin Zhu, Tong Xu and Xiangnan He* - - The large-scale visual-language pre-trained model, Contrastive Language-Image -Pre-training (CLIP), has significantly improved image captioning for scenarios -without human-annotated image-caption pairs. Recent advanced CLIP-based image -captioning without human annotations follows a text-only training paradigm, -i.e., reconstructing text from shared embedding space. Nevertheless, these -approaches are limited by the training/inference gap or huge storage -requirements for text embeddings. 
Given that it is trivial to obtain images in -the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates -images into the training process to enable the model to "see" real visual -modality. Particularly, we use adversarial training to teach CgT-GAN to mimic -the phrases of an external text corpus and CLIP-based reward to provide -semantic guidance. The caption generator is jointly rewarded based on the -caption naturalness to human language calculated from the GAN's discriminator -and the semantic guidance reward computed by the CLIP-based reward module. In -addition to the cosine similarity as the semantic guidance reward (i.e., -CLIP-cos), we further introduce a novel semantic guidance reward called -CLIP-agg, which aligns the generated caption with a weighted text embedding by -attentively aggregating the entire corpus. Experimental results on three -subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms -state-of-the-art methods significantly across all metrics. Code is available at -https://github.com/Lihr747/CgtGAN. - ---------------- - -### 31 Jan 2024 | [Image Anything: Towards Reasoning-coherent and Training-free Multi-modal Image Generation](https://arxiv.org/abs/2401.17664) | [⬇️](https://arxiv.org/pdf/2401.17664) -*Yuanhuiyi Lyu, Xu Zheng, Lin Wang* - - The multifaceted nature of human perception and comprehension indicates that, -when we think, our body can naturally take any combination of senses, a.k.a., -modalities and form a beautiful picture in our brain. For example, when we see -a cattery and simultaneously perceive the cat's purring sound, our brain can -construct a picture of a cat in the cattery. Intuitively, generative AI models -should hold the versatility of humans and be capable of generating images from -any combination of modalities efficiently and collaboratively. This paper -presents ImgAny, a novel end-to-end multi-modal generative model that can mimic -human reasoning and generate high-quality images. Our method serves as the -first attempt in its capacity of efficiently and flexibly taking any -combination of seven modalities, ranging from language, audio to vision -modalities, including image, point cloud, thermal, depth, and event data. Our -key idea is inspired by human-level cognitive processes and involves the -integration and harmonization of multiple input modalities at both the entity -and attribute levels without specific tuning across modalities. Accordingly, -our method brings two novel training-free technical branches: 1) Entity Fusion -Branch ensures the coherence between inputs and outputs. It extracts entity -features from the multi-modal representations powered by our specially -constructed entity knowledge graph; 2) Attribute Fusion Branch adeptly -preserves and processes the attributes. It efficiently amalgamates distinct -attributes from diverse input modalities via our proposed attribute knowledge -graph. Lastly, the entity and attribute features are adaptively fused as the -conditional inputs to the pre-trained Stable Diffusion model for image -generation. Extensive experiments under diverse modality combinations -demonstrate its exceptional capability for visual content creation. - ---------------- - -### 15 Oct 2022 | [GAMA: Generative Adversarial Multi-Object Scene Attacks](https://arxiv.org/abs/2209.09502) | [⬇️](https://arxiv.org/pdf/2209.09502) -*Abhishek Aich, Calvin-Khang Ta, Akash Gupta, Chengyu Song, Srikanth V. Krishnamurthy, M. Salman Asif, Amit K. 
Roy-Chowdhury* - - The majority of methods for crafting adversarial attacks have focused on -scenes with a single dominant object (e.g., images from ImageNet). On the other -hand, natural scenes include multiple dominant objects that are semantically -related. Thus, it is crucial to explore designing attack strategies that look -beyond learning on single-object scenes or attack single-object victim -classifiers. Due to their inherent property of strong transferability of -perturbations to unknown models, this paper presents the first approach of -using generative models for adversarial attacks on multi-object scenes. In -order to represent the relationships between different objects in the input -scene, we leverage upon the open-sourced pre-trained vision-language model CLIP -(Contrastive Language-Image Pre-training), with the motivation to exploit the -encoded semantics in the language space along with the visual space. We call -this attack approach Generative Adversarial Multi-object scene Attacks (GAMA). -GAMA demonstrates the utility of the CLIP model as an attacker's tool to train -formidable perturbation generators for multi-object scenes. Using the joint -image-text features to train the generator, we show that GAMA can craft potent -transferable perturbations in order to fool victim classifiers in various -attack settings. For example, GAMA triggers ~16% more misclassification than -state-of-the-art generative approaches in black-box settings where both the -classifier architecture and data distribution of the attacker are different -from the victim. Our code is available here: -https://abhishekaich27.github.io/gama.html - ---------------- - -### 18 Apr 2019 | [Learning to Collocate Neural Modules for Image Captioning](https://arxiv.org/abs/1904.08608) | [⬇️](https://arxiv.org/pdf/1904.08608) -*Xu Yang, Hanwang Zhang, Jianfei Cai* - - We do not speak word by word from scratch; our brain quickly structures a -pattern like \textsc{sth do sth at someplace} and then fill in the detailed -descriptions. To render existing encoder-decoder image captioners such -human-like reasoning, we propose a novel framework: learning to Collocate -Neural Modules (CNM), to generate the `inner pattern' connecting visual encoder -and language decoder. Unlike the widely-used neural module networks in visual -Q\&A, where the language (ie, question) is fully observable, CNM for captioning -is more challenging as the language is being generated and thus is partially -observable. To this end, we make the following technical contributions for CNM -training: 1) compact module design --- one for function words and three for -visual content words (eg, noun, adjective, and verb), 2) soft module fusion and -multi-step module execution, robustifying the visual reasoning in partial -observation, 3) a linguistic loss for module controller being faithful to -part-of-speech collocations (eg, adjective is before noun). Extensive -experiments on the challenging MS-COCO image captioning benchmark validate the -effectiveness of our CNM image captioner. In particular, CNM achieves a new -state-of-the-art 127.9 CIDEr-D on Karpathy split and a single-model 126.0 c40 -on the official server. CNM is also robust to few training samples, eg, by -training only one sentence per image, CNM can halve the performance loss -compared to a strong baseline. 
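Several of the results above rely on the standard generator/discriminator loop (e.g., CgT-GAN's caption generator trained against a discriminator, and the GAN-based mirroring idea in the section summary). Below is a minimal, illustrative PyTorch sketch of that loop on toy 1-D data; the layer sizes, data distribution, and learning rates are arbitrary assumptions, not the setup of any paper listed here.

```python
# Minimal generator/discriminator training loop in PyTorch.
# Illustrative sketch only: toy 1-D data, arbitrary layer sizes and learning
# rates -- not the setup of any paper listed above.
import torch
import torch.nn as nn

torch.manual_seed(0)
G = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))  # noise -> sample
D = nn.Sequential(nn.Linear(1, 16), nn.ReLU(), nn.Linear(16, 1))  # sample -> logit
opt_g = torch.optim.Adam(G.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(D.parameters(), lr=1e-3)
bce = nn.BCEWithLogitsLoss()

for step in range(200):
    real = 0.5 * torch.randn(64, 1) + 2.0   # "real" data: N(2, 0.5^2)
    fake = G(torch.randn(64, 8))

    # Discriminator step: push real toward label 1, generated toward label 0.
    d_loss = bce(D(real), torch.ones(64, 1)) + bce(D(fake.detach()), torch.zeros(64, 1))
    opt_d.zero_grad()
    d_loss.backward()
    opt_d.step()

    # Generator step: make the discriminator label fresh samples as real.
    g_loss = bce(D(G(torch.randn(64, 8))), torch.ones(64, 1))
    opt_g.zero_grad()
    g_loss.backward()
    opt_g.step()
```

The papers in this list differ mainly in what the generator produces (captions, perturbations, faces, masks) and in the extra terms added to the generator objective, such as CLIP-based semantic rewards.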
- ---------------- - -### 13 Nov 2018 | [Learning Segmentation Masks with the Independence Prior](https://arxiv.org/abs/1811.04682) | [⬇️](https://arxiv.org/pdf/1811.04682) -*Songmin Dai, Xiaoqiang Li, Lu Wang, Pin Wu, Weiqin Tong, Yimin Chen* - - An instance with a bad mask might make a composite image that uses it look -fake. This encourages us to learn segmentation by generating realistic -composite images. To achieve this, we propose a novel framework that exploits a -new proposed prior called the independence prior based on Generative -Adversarial Networks (GANs). The generator produces an image with multiple -category-specific instance providers, a layout module and a composition module. -Firstly, each provider independently outputs a category-specific instance image -with a soft mask. Then the provided instances' poses are corrected by the -layout module. Lastly, the composition module combines these instances into a -final image. Training with adversarial loss and penalty for mask area, each -provider learns a mask that is as small as possible but enough to cover a -complete category-specific instance. Weakly supervised semantic segmentation -methods widely use grouping cues modeling the association between image parts, -which are either artificially designed or learned with costly segmentation -labels or only modeled on local pairs. Unlike them, our method automatically -models the dependence between any parts and learns instance segmentation. We -apply our framework in two cases: (1) Foreground segmentation on -category-specific images with box-level annotation. (2) Unsupervised learning -of instance appearances and masks with only one image of homogeneous object -cluster (HOC). We get appealing results in both tasks, which shows the -independence prior is useful for instance segmentation and it is possible to -unsupervisedly learn instance masks with only one image. - ---------------- - -### 29 Dec 2020 | [Annotation-Efficient Learning for Medical Image Segmentation based on Noisy Pseudo Labels and Adversarial Learning](https://arxiv.org/abs/2012.14584) | [⬇️](https://arxiv.org/pdf/2012.14584) -*Lu Wang, Dong Guo, Guotai Wang and Shaoting Zhang* - - Despite that deep learning has achieved state-of-the-art performance for -medical image segmentation, its success relies on a large set of manually -annotated images for training that are expensive to acquire. In this paper, we -propose an annotation-efficient learning framework for segmentation tasks that -avoids annotations of training images, where we use an improved -Cycle-Consistent Generative Adversarial Network (GAN) to learn from a set of -unpaired medical images and auxiliary masks obtained either from a shape model -or public datasets. We first use the GAN to generate pseudo labels for our -training images under the implicit high-level shape constraint represented by a -Variational Auto-encoder (VAE)-based discriminator with the help of the -auxiliary masks, and build a Discriminator-guided Generator Channel Calibration -(DGCC) module which employs our discriminator's feedback to calibrate the -generator for better pseudo labels. To learn from the pseudo labels that are -noisy, we further introduce a noise-robust iterative learning method using -noise-weighted Dice loss. We validated our framework with two situations: -objects with a simple shape model like optic disc in fundus images and fetal -head in ultrasound images, and complex structures like lung in X-Ray images and -liver in CT images. 
Experimental results demonstrated that 1) Our VAE-based -discriminator and DGCC module help to obtain high-quality pseudo labels. 2) Our -proposed noise-robust learning method can effectively overcome the effect of -noisy pseudo labels. 3) The segmentation performance of our method without -using annotations of training images is close or even comparable to that of -learning from human annotations. - ---------------- - -### 10 Jul 2023 | [AmadeusGPT: a natural language interface for interactive animal behavioral analysis](https://arxiv.org/abs/2307.04858) | [⬇️](https://arxiv.org/pdf/2307.04858) -*Shaokai Ye, Jessy Lauer, Mu Zhou, Alexander Mathis, Mackenzie W. Mathis* - - The process of quantifying and analyzing animal behavior involves translating -the naturally occurring descriptive language of their actions into -machine-readable code. Yet, codifying behavior analysis is often challenging -without deep understanding of animal behavior and technical machine learning -knowledge. To limit this gap, we introduce AmadeusGPT: a natural language -interface that turns natural language descriptions of behaviors into -machine-executable code. Large-language models (LLMs) such as GPT3.5 and GPT4 -allow for interactive language-based queries that are potentially well suited -for making interactive behavior analysis. However, the comprehension capability -of these LLMs is limited by the context window size, which prevents it from -remembering distant conversations. To overcome the context window limitation, -we implement a novel dual-memory mechanism to allow communication between -short-term and long-term memory using symbols as context pointers for retrieval -and saving. Concretely, users directly use language-based definitions of -behavior and our augmented GPT develops code based on the core AmadeusGPT API, -which contains machine learning, computer vision, spatio-temporal reasoning, -and visualization modules. Users then can interactively refine results, and -seamlessly add new behavioral modules as needed. We benchmark AmadeusGPT and -show we can produce state-of-the-art performance on the MABE 2022 behavior -challenge tasks. Note, an end-user would not need to write any code to achieve -this. Thus, collectively AmadeusGPT presents a novel way to merge deep -biological knowledge, large-language models, and core computer vision modules -into a more naturally intelligent system. Code and demos can be found at: -https://github.com/AdaptiveMotorControlLab/AmadeusGPT. - ---------------- - -### 26 Nov 2019 | [Text2FaceGAN: Face Generation from Fine Grained Textual Descriptions](https://arxiv.org/abs/1911.11378) | [⬇️](https://arxiv.org/pdf/1911.11378) -*Osaid Rehman Nasir, Shailesh Kumar Jha, Manraj Singh Grover, Yi Yu, Ajit Kumar, Rajiv Ratn Shah* - - Powerful generative adversarial networks (GAN) have been developed to -automatically synthesize realistic images from text. However, most existing -tasks are limited to generating simple images such as flowers from captions. In -this work, we extend this problem to the less addressed domain of face -generation from fine-grained textual descriptions of face, e.g., "A person has -curly hair, oval face, and mustache". We are motivated by the potential of -automated face generation to impact and assist critical tasks such as criminal -face reconstruction. 
Since current datasets for the task are either very small -or do not contain captions, we generate captions for images in the CelebA -dataset by creating an algorithm to automatically convert a list of attributes -to a set of captions. We then model the highly multi-modal problem of text to -face generation as learning the conditional distribution of faces (conditioned -on text) in same latent space. We utilize the current state-of-the-art GAN -(DC-GAN with GAN-CLS loss) for learning conditional multi-modality. The -presence of more fine-grained details and variable length of the captions makes -the problem easier for a user but more difficult to handle compared to the -other text-to-image tasks. We flipped the labels for real and fake images and -added noise in discriminator. Generated images for diverse textual descriptions -show promising results. In the end, we show how the widely used inceptions -score is not a good metric to evaluate the performance of generative models -used for synthesizing faces from text. - ---------------- - -### 21 Oct 2019 | [A Survey and Taxonomy of Adversarial Neural Networks for Text-to-Image Synthesis](https://arxiv.org/abs/1910.09399) | [⬇️](https://arxiv.org/pdf/1910.09399) -*Jorge Agnese, Jonathan Herrera, Haicheng Tao, Xingquan Zhu* - - Text-to-image synthesis refers to computational methods which translate human -written textual descriptions, in the form of keywords or sentences, into images -with similar semantic meaning to the text. In earlier research, image synthesis -relied mainly on word to image correlation analysis combined with supervised -methods to find best alignment of the visual content matching to the text. -Recent progress in deep learning (DL) has brought a new set of unsupervised -deep learning methods, particularly deep generative models which are able to -generate realistic visual images using suitably trained neural network models. -In this paper, we review the most recent development in the text-to-image -synthesis research domain. Our survey first introduces image synthesis and its -challenges, and then reviews key concepts such as generative adversarial -networks (GANs) and deep convolutional encoder-decoder neural networks (DCNN). -After that, we propose a taxonomy to summarize GAN based text-to-image -synthesis into four major categories: Semantic Enhancement GANs, Resolution -Enhancement GANs, Diversity Enhancement GANS, and Motion Enhancement GANs. We -elaborate the main objective of each group, and further review typical GAN -architectures in each group. The taxonomy and the review outline the techniques -and the evolution of different approaches, and eventually provide a clear -roadmap to summarize the list of contemporaneous solutions that utilize GANs -and DCNNs to generate enthralling results in categories such as human faces, -birds, flowers, room interiors, object reconstruction from edge maps (games) -etc. The survey will conclude with a comparison of the proposed solutions, -challenges that remain unresolved, and future developments in the text-to-image -synthesis domain. - ---------------- - -### 24 Apr 2022 | [Emotion-Aware Transformer Encoder for Empathetic Dialogue Generation](https://arxiv.org/abs/2204.11320) | [⬇️](https://arxiv.org/pdf/2204.11320) -*Raman Goel, Seba Susan, Sachin Vashisht, and Armaan Dhanda* - - Modern day conversational agents are trained to emulate the manner in which -humans communicate. 
To emotionally bond with the user, these virtual agents -need to be aware of the affective state of the user. Transformers are the -recent state of the art in sequence-to-sequence learning that involves training -an encoder-decoder model with word embeddings from utterance-response pairs. We -propose an emotion-aware transformer encoder for capturing the emotional -quotient in the user utterance in order to generate human-like empathetic -responses. The contributions of our paper are as follows: 1) An emotion -detector module trained on the input utterances determines the affective state -of the user in the initial phase 2) A novel transformer encoder is proposed -that adds and normalizes the word embedding with emotion embedding thereby -integrating the semantic and affective aspects of the input utterance 3) The -encoder and decoder stacks belong to the Transformer-XL architecture which is -the recent state of the art in language modeling. Experimentation on the -benchmark Facebook AI empathetic dialogue dataset confirms the efficacy of our -model from the higher BLEU-4 scores achieved for the generated responses as -compared to existing methods. Emotionally intelligent virtual agents are now a -reality and inclusion of affect as a modality in all human-machine interfaces -is foreseen in the immediate future. - ---------------- - -### 25 Feb 2022 | [SGL: Symbolic Goal Learning in a Hybrid, Modular Framework for Human Instruction Following](https://arxiv.org/abs/2202.12912) | [⬇️](https://arxiv.org/pdf/2202.12912) -*Ruinian Xu and Hongyi Chen and Yunzhi Lin and Patricio A. Vela* - - This paper investigates robot manipulation based on human instruction with -ambiguous requests. The intent is to compensate for imperfect natural language -via visual observations. Early symbolic methods, based on manually defined -symbols, built modular framework consist of semantic parsing and task planning -for producing sequences of actions from natural language requests. Modern -connectionist methods employ deep neural networks to automatically learn visual -and linguistic features and map to a sequence of low-level actions, in an -endto-end fashion. These two approaches are blended to create a hybrid, modular -framework: it formulates instruction following as symbolic goal learning via -deep neural networks followed by task planning via symbolic planners. -Connectionist and symbolic modules are bridged with Planning Domain Definition -Language. The vision-and-language learning network predicts its goal -representation, which is sent to a planner for producing a task-completing -action sequence. For improving the flexibility of natural language, we further -incorporate implicit human intents with explicit human instructions. To learn -generic features for vision and language, we propose to separately pretrain -vision and language encoders on scene graph parsing and semantic textual -similarity tasks. Benchmarking evaluates the impacts of different components -of, or options for, the vision-and-language learning model and shows the -effectiveness of pretraining strategies. Manipulation experiments conducted in -the simulator AI2THOR show the robustness of the framework to novel scenarios. 
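To make the SGL hand-off concrete: a learned vision-and-language model predicts a symbolic goal, and a classical planner consumes it through a PDDL problem description. The sketch below is a hypothetical illustration of that bridge only; the domain, objects, and predicates are invented, and this is not the authors' code.

```python
# Hypothetical sketch: render a predicted symbolic goal as a PDDL problem string
# that a classical planner could consume. All names below are invented.

def goal_to_pddl_problem(goal_atoms, objects, domain="kitchen"):
    """goal_atoms: list of (predicate, arg1, arg2, ...) tuples predicted by the model."""
    objs = " ".join(objects)
    goal = " ".join(f"({pred} {' '.join(args)})" for pred, *args in goal_atoms)
    return (
        f"(define (problem fetch-task)\n"
        f"  (:domain {domain})\n"
        f"  (:objects {objs})\n"
        f"  (:init (hand-empty robot))\n"
        f"  (:goal (and {goal})))"
    )

# Example: the perception model decided the goal is "the cup ends up in the sink".
print(goal_to_pddl_problem([("in", "cup", "sink")], ["robot", "cup", "sink"]))
```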
- ---------------- - -### 02 Dec 2020 | [A Framework and Dataset for Abstract Art Generation via CalligraphyGAN](https://arxiv.org/abs/2012.00744) | [⬇️](https://arxiv.org/pdf/2012.00744) -*Jinggang Zhuo, Ling Fan, Harry Jiannan Wang* - - With the advancement of deep learning, artificial intelligence (AI) has made -many breakthroughs in recent years and achieved superhuman performance in -various tasks such as object detection, reading comprehension, and video games. -Generative Modeling, such as various Generative Adversarial Networks (GAN) -models, has been applied to generate paintings and music. Research in Natural -Language Processing (NLP) also had a leap forward in 2018 since the release of -the pre-trained contextual neural language models such as BERT and recently -released GPT3. Despite the exciting AI applications aforementioned, AI is still -significantly lagging behind humans in creativity, which is often considered -the ultimate moonshot for AI. Our work is inspired by Chinese calligraphy, -which is a unique form of visual art where the character itself is an aesthetic -painting. We also draw inspirations from paintings of the Abstract -Expressionist movement in the 1940s and 1950s, such as the work by American -painter Franz Kline. In this paper, we present a creative framework based on -Conditional Generative Adversarial Networks and Contextual Neural Language -Model to generate abstract artworks that have intrinsic meaning and aesthetic -value, which is different from the existing work, such as image captioning and -text-to-image generation, where the texts are the descriptions of the images. -In addition, we have publicly released a Chinese calligraphy image dataset and -demonstrate our framework using a prototype system and a user study. - ---------------- - -### 06 Jul 2018 | [Understanding Humans in Crowded Scenes: Deep Nested Adversarial Learning and A New Benchmark for Multi-Human Parsing](https://arxiv.org/abs/1804.03287) | [⬇️](https://arxiv.org/pdf/1804.03287) -*Jian Zhao, Jianshu Li, Yu Cheng, Li Zhou, Terence Sim, Shuicheng Yan, Jiashi Feng* - - Despite the noticeable progress in perceptual tasks like detection, instance -segmentation and human parsing, computers still perform unsatisfactorily on -visually understanding humans in crowded scenes, such as group behavior -analysis, person re-identification and autonomous driving, etc. To this end, -models need to comprehensively perceive the semantic information and the -differences between instances in a multi-human image, which is recently defined -as the multi-human parsing task. In this paper, we present a new large-scale -database "Multi-Human Parsing (MHP)" for algorithm development and evaluation, -and advances the state-of-the-art in understanding humans in crowded scenes. -MHP contains 25,403 elaborately annotated images with 58 fine-grained semantic -category labels, involving 2-26 persons per image and captured in real-world -scenes from various viewpoints, poses, occlusion, interactions and background. -We further propose a novel deep Nested Adversarial Network (NAN) model for -multi-human parsing. NAN consists of three Generative Adversarial Network -(GAN)-like sub-nets, respectively performing semantic saliency prediction, -instance-agnostic parsing and instance-aware clustering. These sub-nets form a -nested structure and are carefully designed to learn jointly in an end-to-end -way. 
NAN consistently outperforms existing state-of-the-art solutions on our -MHP and several other datasets, and serves as a strong baseline to drive the -future research for multi-human parsing. - ---------------- - -### 10 May 2021 | [Robust Training Using Natural Transformation](https://arxiv.org/abs/2105.04070) | [⬇️](https://arxiv.org/pdf/2105.04070) -*Shuo Wang, Lingjuan Lyu, Surya Nepal, Carsten Rudolph, Marthie Grobler, Kristen Moore* - - Previous robustness approaches for deep learning models such as data -augmentation techniques via data transformation or adversarial training cannot -capture real-world variations that preserve the semantics of the input, such as -a change in lighting conditions. To bridge this gap, we present NaTra, an -adversarial training scheme that is designed to improve the robustness of image -classification algorithms. We target attributes of the input images that are -independent of the class identification, and manipulate those attributes to -mimic real-world natural transformations (NaTra) of the inputs, which are then -used to augment the training dataset of the image classifier. Specifically, we -apply \textit{Batch Inverse Encoding and Shifting} to map a batch of given -images to corresponding disentangled latent codes of well-trained generative -models. \textit{Latent Codes Expansion} is used to boost image reconstruction -quality through the incorporation of extended feature maps. -\textit{Unsupervised Attribute Directing and Manipulation} enables -identification of the latent directions that correspond to specific attribute -changes, and then produce interpretable manipulations of those attributes, -thereby generating natural transformations to the input data. We demonstrate -the efficacy of our scheme by utilizing the disentangled latent representations -derived from well-trained GANs to mimic transformations of an image that are -similar to real-world natural variations (such as lighting conditions or -hairstyle), and train models to be invariant to these natural transformations. -Extensive experiments show that our method improves generalization of -classification models and increases its robustness to various real-world -distortions - ---------------- - -### 02 Sep 2020 | [Unified Generative Adversarial Networks for Controllable Image-to-Image Translation](https://arxiv.org/abs/1912.06112) | [⬇️](https://arxiv.org/pdf/1912.06112) -*Hao Tang, Hong Liu, Nicu Sebe* - - We propose a unified Generative Adversarial Network (GAN) for controllable -image-to-image translation, i.e., transferring an image from a source to a -target domain guided by controllable structures. In addition to conditioning on -a reference image, we show how the model can generate images conditioned on -controllable structures, e.g., class labels, object keypoints, human skeletons, -and scene semantic maps. The proposed model consists of a single generator and -a discriminator taking a conditional image and the target controllable -structure as input. In this way, the conditional image can provide appearance -information and the controllable structure can provide the structure -information for generating the target result. Moreover, our model learns the -image-to-image mapping through three novel losses, i.e., color loss, -controllable structure guided cycle-consistency loss, and controllable -structure guided self-content preserving loss. Also, we present the Fr\'echet -ResNet Distance (FRD) to evaluate the quality of the generated images. 
-Experiments on two challenging image translation tasks, i.e., hand -gesture-to-gesture translation and cross-view image translation, show that our -model generates convincing results, and significantly outperforms other -state-of-the-art methods on both tasks. Meanwhile, the proposed framework is a -unified solution, thus it can be applied to solving other controllable -structure guided image translation tasks such as landmark guided facial -expression translation and keypoint guided person image generation. To the best -of our knowledge, we are the first to make one GAN framework work on all such -controllable structure guided image translation tasks. Code is available at -https://github.com/Ha0Tang/GestureGAN. - ---------------- - -### 07 Jul 2021 | [Controlled Caption Generation for Images Through Adversarial Attacks](https://arxiv.org/abs/2107.03050) | [⬇️](https://arxiv.org/pdf/2107.03050) -*Nayyer Aafaq, Naveed Akhtar, Wei Liu, Mubarak Shah and Ajmal Mian* - - Deep learning is found to be vulnerable to adversarial examples. However, its -adversarial susceptibility in image caption generation is under-explored. We -study adversarial examples for vision and language models, which typically -adopt an encoder-decoder framework consisting of two major components: a -Convolutional Neural Network (i.e., CNN) for image feature extraction and a -Recurrent Neural Network (RNN) for caption generation. In particular, we -investigate attacks on the visual encoder's hidden layer that is fed to the -subsequent recurrent network. The existing methods either attack the -classification layer of the visual encoder or they back-propagate the gradients -from the language model. In contrast, we propose a GAN-based algorithm for -crafting adversarial examples for neural image captioning that mimics the -internal representation of the CNN such that the resulting deep features of the -input image enable a controlled incorrect caption generation through the -recurrent network. Our contribution provides new insights for understanding -adversarial attacks on vision systems with language component. The proposed -method employs two strategies for a comprehensive evaluation. The first -examines if a neural image captioning system can be misled to output targeted -image captions. The second analyzes the possibility of keywords into the -predicted captions. Experiments show that our algorithm can craft effective -adversarial images based on the CNN hidden layers to fool captioning framework. -Moreover, we discover the proposed attack to be highly transferable. Our work -leads to new robustness implications for neural image captioning. - ---------------- - -### 26 Mar 2021 | [Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis](https://arxiv.org/abs/2003.11571) | [⬇️](https://arxiv.org/pdf/2003.11571) -*Wei Sun and Tianfu Wu* - - With the remarkable recent progress on learning deep generative models, it -becomes increasingly interesting to develop models for controllable image -synthesis from reconfigurable inputs. This paper focuses on a recent emerged -task, layout-to-image, to learn generative models that are capable of -synthesizing photo-realistic images from spatial layout (i.e., object bounding -boxes configured in an image lattice) and style (i.e., structural and -appearance variations encoded by latent vectors). 
This paper first proposes an -intuitive paradigm for the task, layout-to-mask-to-image, to learn to unfold -object masks of given bounding boxes in an input layout to bridge the gap -between the input layout and synthesized images. Then, this paper presents a -method built on Generative Adversarial Networks for the proposed -layout-to-mask-to-image with style control at both image and mask levels. -Object masks are learned from the input layout and iteratively refined along -stages in the generator network. Style control at the image level is the same -as in vanilla GANs, while style control at the object mask level is realized by -a proposed novel feature normalization scheme, Instance-Sensitive and -Layout-Aware Normalization. In experiments, the proposed method is tested in -the COCO-Stuff dataset and the Visual Genome dataset with state-of-the-art -performance obtained. - ---------------- - -### 17 May 2021 | [Prototype-supervised Adversarial Network for Targeted Attack of Deep Hashing](https://arxiv.org/abs/2105.07553) | [⬇️](https://arxiv.org/pdf/2105.07553) -*Xunguang Wang, Zheng Zhang, Baoyuan Wu, Fumin Shen, Guangming Lu* - - Due to its powerful capability of representation learning and high-efficiency -computation, deep hashing has made significant progress in large-scale image -retrieval. However, deep hashing networks are vulnerable to adversarial -examples, which is a practical secure problem but seldom studied in -hashing-based retrieval field. In this paper, we propose a novel -prototype-supervised adversarial network (ProS-GAN), which formulates a -flexible generative architecture for efficient and effective targeted hashing -attack. To the best of our knowledge, this is the first generation-based method -to attack deep hashing networks. Generally, our proposed framework consists of -three parts, i.e., a PrototypeNet, a generator, and a discriminator. -Specifically, the designed PrototypeNet embeds the target label into the -semantic representation and learns the prototype code as the category-level -representative of the target label. Moreover, the semantic representation and -the original image are jointly fed into the generator for a flexible targeted -attack. Particularly, the prototype code is adopted to supervise the generator -to construct the targeted adversarial example by minimizing the Hamming -distance between the hash code of the adversarial example and the prototype -code. Furthermore, the generator is against the discriminator to simultaneously -encourage the adversarial examples visually realistic and the semantic -representation informative. Extensive experiments verify that the proposed -framework can efficiently produce adversarial examples with better targeted -attack performance and transferability over state-of-the-art targeted attack -methods of deep hashing. The related codes could be available at -https://github.com/xunguangwang/ProS-GAN . - ---------------- - - - - -# 7 - - - -1. 🔑 Conditional Computation and Routing Strategies - - 📚 Papers explore efficient routing mechanisms to selectively activate expert models - - 💡 Example: Telemed AI system routes patients to specialized expert models based on symptoms - -2. 🧠 Capacity and Scalability Improvements - - 📗 Techniques enable training and deploying larger, more accurate expert models - - 💻 Example: AI developer can scale up expert models for specific medical domains - -3. 
🤝 Ensemble Learning and Knowledge Distillation - - 📕 Combining predictions from multiple expert models for improved performance - - ⚕️ Example: Ensemble of expert models for comprehensive medical diagnosis - -### Keywords and Glossary - -1. 🔑 Routing Strategies -2. 🧠 Model Capacity -3. 📊 Scalability -4. 🤝 Ensemble Learning -5. 🔎 Knowledge Distillation -6. ⚡ Conditional Computation - -```mermaid -graph LR - A[Routing Strategies] --> B[Model Capacity] - B --> C[Scalability] - C --> D[Ensemble Learning] - D --> E[Knowledge Distillation] - E --> F[Conditional Computation] - F --> A -``` - -```python -# app.py -import streamlit as st - -st.title("Mixture of Experts (MoE) Glossary") - -st.header("1. Routing Strategies") -st.write("Mechanisms for selectively activating expert models based on input data.") - -st.header("2. Model Capacity") -st.write("Ability to train and deploy larger, more accurate expert models.") - -st.header("3. Scalability") -st.write("Techniques for scaling up expert models to handle larger datasets or more complex tasks.") - -st.header("4. Ensemble Learning") -st.write("Combining predictions from multiple expert models to improve overall performance.") - -st.header("5. Knowledge Distillation") -st.write("Transferring knowledge from larger expert models to smaller, more efficient models.") - -st.header("6. Conditional Computation") -st.write("Selectively computing only the necessary components of a model based on the input data.") -``` - - - -# 🩺🔍 Search Results -### 05 Jun 2023 | [COMET: Learning Cardinality Constrained Mixture of Experts with Trees and Local Search](https://arxiv.org/abs/2306.02824) | [⬇️](https://arxiv.org/pdf/2306.02824) -*Shibal Ibrahim, Wenyu Chen, Hussein Hazimeh, Natalia Ponomareva, Zhe Zhao, Rahul Mazumder* - - The sparse Mixture-of-Experts (Sparse-MoE) framework efficiently scales up -model capacity in various domains, such as natural language processing and -vision. Sparse-MoEs select a subset of the "experts" (thus, only a portion of -the overall network) for each input sample using a sparse, trainable gate. -Existing sparse gates are prone to convergence and performance issues when -training with first-order optimization methods. In this paper, we introduce two -improvements to current MoE approaches. First, we propose a new sparse gate: -COMET, which relies on a novel tree-based mechanism. COMET is differentiable, -can exploit sparsity to speed up computation, and outperforms state-of-the-art -gates. Second, due to the challenging combinatorial nature of sparse expert -selection, first-order methods are typically prone to low-quality solutions. To -deal with this challenge, we propose a novel, permutation-based local search -method that can complement first-order methods in training any sparse gate, -e.g., Hash routing, Top-k, DSelect-k, and COMET. We show that local search can -help networks escape bad initializations or solutions. We performed large-scale -experiments on various domains, including recommender systems, vision, and -natural language processing. On standard vision and recommender systems -benchmarks, COMET+ (COMET with local search) achieves up to 13% improvement in -ROC AUC over popular gates, e.g., Hash routing and Top-k, and up to 9% over -prior differentiable gates e.g., DSelect-k. When Top-k and Hash gates are -combined with local search, we see up to $100\times$ reduction in the budget -needed for hyperparameter tuning. 
Moreover, for language modeling, our approach -improves over the state-of-the-art MoEBERT model for distilling BERT on 5/7 -GLUE benchmarks as well as SQuAD dataset. - ---------------- - -### 04 Jan 2024 | [Text2MDT: Extracting Medical Decision Trees from Medical Texts](https://arxiv.org/abs/2401.02034) | [⬇️](https://arxiv.org/pdf/2401.02034) -*Wei Zhu and Wenfeng Li and Xing Tian and Pengfei Wang and Xiaoling Wang and Jin Chen and Yuanbin Wu and Yuan Ni and Guotong Xie* - - Knowledge of the medical decision process, which can be modeled as medical -decision trees (MDTs), is critical to build clinical decision support systems. -However, the current MDT construction methods rely heavily on time-consuming -and laborious manual annotation. In this work, we propose a novel task, -Text2MDT, to explore the automatic extraction of MDTs from medical texts such -as medical guidelines and textbooks. We normalize the form of the MDT and -create an annotated Text-to-MDT dataset in Chinese with the participation of -medical experts. We investigate two different methods for the Text2MDT tasks: -(a) an end-to-end framework which only relies on a GPT style large language -models (LLM) instruction tuning to generate all the node information and tree -structures. (b) The pipeline framework which decomposes the Text2MDT task to -three subtasks. Experiments on our Text2MDT dataset demonstrate that: (a) the -end-to-end method basd on LLMs (7B parameters or larger) show promising -results, and successfully outperform the pipeline methods. (b) The -chain-of-thought (COT) prompting method \cite{Wei2022ChainOT} can improve the -performance of the fine-tuned LLMs on the Text2MDT test set. (c) the -lightweight pipelined method based on encoder-based pretrained models can -perform comparably with LLMs with model complexity two magnititudes smaller. -Our Text2MDT dataset is open-sourced at -\url{https://tianchi.aliyun.com/dataset/95414}, and the source codes are -open-sourced at \url{https://github.com/michael-wzhu/text2dt}. - ---------------- - -### 16 Mar 2021 | [Automatic Intent-Slot Induction for Dialogue Systems](https://arxiv.org/abs/2103.08886) | [⬇️](https://arxiv.org/pdf/2103.08886) -*Zengfeng Zeng, Dan Ma, Haiqin Yang, Zhen Gou and Jianping Shen* - - Automatically and accurately identifying user intents and filling the -associated slots from their spoken language are critical to the success of -dialogue systems. Traditional methods require manually defining the -DOMAIN-INTENT-SLOT schema and asking many domain experts to annotate the -corresponding utterances, upon which neural models are trained. This procedure -brings the challenges of information sharing hindering, out-of-schema, or data -sparsity in open-domain dialogue systems. To tackle these challenges, we -explore a new task of {\em automatic intent-slot induction} and propose a novel -domain-independent tool. That is, we design a coarse-to-fine three-step -procedure including Role-labeling, Concept-mining, And Pattern-mining (RCAP): -(1) role-labeling: extracting keyphrases from users' utterances and classifying -them into a quadruple of coarsely-defined intent-roles via sequence labeling; -(2) concept-mining: clustering the extracted intent-role mentions and naming -them into abstract fine-grained concepts; (3) pattern-mining: applying the -Apriori algorithm to mine intent-role patterns and automatically inferring the -intent-slot using these coarse-grained intent-role labels and fine-grained -concepts. 
Empirical evaluations on both real-world in-domain and out-of-domain -datasets show that: (1) our RCAP can generate satisfactory SLU schema and -outperforms the state-of-the-art supervised learning method; (2) our RCAP can -be directly applied to out-of-domain datasets and gain at least 76\% -improvement of F1-score on intent detection and 41\% improvement of F1-score on -slot filling; (3) our RCAP exhibits its power in generic intent-slot -extractions with less manual effort, which opens pathways for schema induction -on new domains and unseen intent-slot discovery for generalizable dialogue -systems. - ---------------- - -### 23 Aug 2023 | [Tryage: Real-time, intelligent Routing of User Prompts to Large Language Models](https://arxiv.org/abs/2308.11601) | [⬇️](https://arxiv.org/pdf/2308.11601) -*Surya Narayanan Hari, Matt Thomson* - - The introduction of the transformer architecture and the self-attention -mechanism has led to an explosive production of language models trained on -specific downstream tasks and data domains. With over 200, 000 models in the -Hugging Face ecosystem, users grapple with selecting and optimizing models to -suit multifaceted workflows and data domains while addressing computational, -security, and recency concerns. There is an urgent need for machine learning -frameworks that can eliminate the burden of model selection and customization -and unleash the incredible power of the vast emerging model library for end -users. Here, we propose a context-aware routing system, Tryage, that leverages -a language model router for optimal selection of expert models from a model -library based on analysis of individual input prompts. Inspired by the thalamic -router in the brain, Tryage employs a perceptive router to predict down-stream -model performance on prompts and, then, makes a routing decision using an -objective function that integrates performance predictions with user goals and -constraints that are incorporated through flags (e.g., model size, model -recency). Tryage allows users to explore a Pareto front and automatically -trade-off between task accuracy and secondary goals including minimization of -model size, recency, security, verbosity, and readability. Across heterogeneous -data sets that include code, text, clinical data, and patents, the Tryage -framework surpasses Gorilla and GPT3.5 turbo in dynamic model selection -identifying the optimal model with an accuracy of 50.9% , compared to 23.6% by -GPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how -routing models can be applied to program and control the behavior of -multi-model LLM systems to maximize efficient use of the expanding and evolving -language model ecosystem. - ---------------- - -### 22 Dec 2023 | [Structured prompt interrogation and recursive extraction of semantics (SPIRES): A method for populating knowledge bases using zero-shot learning](https://arxiv.org/abs/2304.02711) | [⬇️](https://arxiv.org/pdf/2304.02711) -*J. Harry Caufield, Harshad Hegde, Vincent Emonet, Nomi L. Harris, Marcin P. Joachimiak, Nicolas Matentzoglu, HyeongSik Kim, Sierra A.T. Moxon, Justin T. Reese, Melissa A. Haendel, Peter N. Robinson, and Christopher J. Mungall* - - Creating knowledge bases and ontologies is a time consuming task that relies -on a manual curation. AI/NLP approaches can assist expert curators in -populating these knowledge bases, but current approaches rely on extensive -training data, and are not able to populate arbitrary complex nested knowledge -schemas. 
- Here we present Structured Prompt Interrogation and Recursive Extraction of -Semantics (SPIRES), a Knowledge Extraction approach that relies on the ability -of Large Language Models (LLMs) to perform zero-shot learning (ZSL) and -general-purpose query answering from flexible prompts and return information -conforming to a specified schema. Given a detailed, user-defined knowledge -schema and an input text, SPIRES recursively performs prompt interrogation -against GPT-3+ to obtain a set of responses matching the provided schema. -SPIRES uses existing ontologies and vocabularies to provide identifiers for all -matched elements. - We present examples of use of SPIRES in different domains, including -extraction of food recipes, multi-species cellular signaling pathways, disease -treatments, multi-step drug mechanisms, and chemical to disease causation -graphs. Current SPIRES accuracy is comparable to the mid-range of existing -Relation Extraction (RE) methods, but has the advantage of easy customization, -flexibility, and, crucially, the ability to perform new tasks in the absence of -any training data. This method supports a general strategy of leveraging the -language interpreting capabilities of LLMs to assemble knowledge bases, -assisting manual knowledge curation and acquisition while supporting validation -with publicly-available databases and ontologies external to the LLM. - SPIRES is available as part of the open source OntoGPT package: -https://github.com/ monarch-initiative/ontogpt. - ---------------- - -### 28 Jan 2024 | [Contrastive Learning and Mixture of Experts Enables Precise Vector Embeddings](https://arxiv.org/abs/2401.15713) | [⬇️](https://arxiv.org/pdf/2401.15713) -*Rohan Kapur, Logan Hallee, Arjun Patel, Bohdan Khomtchouk* - - The advancement of transformer neural networks has significantly elevated the -capabilities of sentence similarity models, particularly in creating effective -vector representations of natural language inputs. However, these models face -notable challenges in domain-specific contexts, especially in highly -specialized scientific sub-fields. Traditional methods often struggle in this -regime, either overgeneralizing similarities within a niche or being overly -sensitive to minor differences, resulting in inaccurate text classification and -subpar vector representation. In an era where retrieval augmentation and search -are increasingly crucial, precise and concise numerical representations are -essential. In this paper, we target this issue by assembling niche datasets -using co-citations as a similarity metric, focusing on biomedical domains. We -employ two key strategies for fine-tuning state-of-the-art models: 1. -Domain-specific Fine-Tuning, which tailors pretrained models to a single -domain, and 2. Universal Applicability with Mixture of Experts (MoE), adapting -pretrained models with enforced routing for multiple domains simultaneously. -Our training approach emphasizes the use of abstracts for faster training, -incorporating Multiple Negative Rankings loss for efficient contrastive -learning. Notably, our MoE variants, equipped with $N$ experts, achieve the -efficacy of $N$ individual models, heralding a new era of versatile, -One-Size-Fits-All transformer networks for various tasks. This methodology -marks significant advancements in scientific text classification metrics and -holds promise for enhancing vector database search and compilation. 
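The Multiple Negative Rankings loss mentioned in the last abstract is an in-batch contrastive objective: each anchor should score its own positive higher than every other positive in the batch, so the similarity matrix is trained with its diagonal as the target class. A minimal PyTorch sketch follows; the embedding dimension, batch size, and scale factor are assumptions for illustration.

```python
# In-batch contrastive (multiple-negatives-ranking style) loss: illustrative sketch only.
import torch
import torch.nn.functional as F

def multiple_negatives_ranking_loss(anchors, positives, scale=20.0):
    """anchors, positives: (batch, dim) embeddings of paired texts (e.g., co-cited abstracts)."""
    a = F.normalize(anchors, dim=-1)
    p = F.normalize(positives, dim=-1)
    sim = scale * a @ p.T                   # (batch, batch) scaled cosine similarities
    labels = torch.arange(sim.size(0))      # the i-th positive belongs to the i-th anchor
    return F.cross_entropy(sim, labels)

# Toy usage with random embeddings (dimension 384 is an arbitrary choice).
loss = multiple_negatives_ranking_loss(torch.randn(16, 384), torch.randn(16, 384))
print(loss.item())
```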
- ---------------- - -### 20 Mar 2022 | [Build a Robust QA System with Transformer-based Mixture of Experts](https://arxiv.org/abs/2204.09598) | [⬇️](https://arxiv.org/pdf/2204.09598) -*Yu Qing Zhou, Xixuan Julie Liu, Yuanzhe Dong* - - In this paper, we aim to build a robust question answering system that can -adapt to out-of-domain datasets. A single network may overfit to the -superficial correlation in the training distribution, but with a meaningful -number of expert sub-networks, a gating network that selects a sparse -combination of experts for each input, and careful balance on the importance of -expert sub-networks, the Mixture-of-Experts (MoE) model allows us to train a -multi-task learner that can be generalized to out-of-domain datasets. We also -explore the possibility of bringing the MoE layers up to the middle of the -DistilBERT and replacing the dense feed-forward network with a -sparsely-activated switch FFN layers, similar to the Switch Transformer -architecture, which simplifies the MoE routing algorithm with reduced -communication and computational costs. In addition to model architectures, we -explore techniques of data augmentation including Easy Data Augmentation (EDA) -and back translation, to create more meaningful variance among the small -out-of-domain training data, therefore boosting the performance and robustness -of our models. In this paper, we show that our combination of best architecture -and data augmentation techniques achieves a 53.477 F1 score in the -out-of-domain evaluation, which is a 9.52% performance gain over the baseline. -On the final test set, we reported a higher 59.506 F1 and 41.651 EM. We -successfully demonstrate the effectiveness of Mixture-of-Expert architecture in -a Robust QA task. - ---------------- - -### 22 Sep 2023 | [Large Language Models and Control Mechanisms Improve Text Readability of Biomedical Abstracts](https://arxiv.org/abs/2309.13202) | [⬇️](https://arxiv.org/pdf/2309.13202) -*Zihao Li, Samuel Belkadi, Nicolo Micheletti, Lifeng Han, Matthew Shardlow, Goran Nenadic* - - Biomedical literature often uses complex language and inaccessible -professional terminologies. That is why simplification plays an important role -in improving public health literacy. Applying Natural Language Processing (NLP) -models to automate such tasks allows for quick and direct accessibility for lay -readers. In this work, we investigate the ability of state-of-the-art large -language models (LLMs) on the task of biomedical abstract simplification, using -the publicly available dataset for plain language adaptation of biomedical -abstracts (\textbf{PLABA}). The methods applied include domain fine-tuning and -prompt-based learning (PBL) on: 1) Encoder-decoder models (T5, SciFive, and -BART), 2) Decoder-only GPT models (GPT-3.5 and GPT-4) from OpenAI and BioGPT, -and 3) Control-token mechanisms on BART-based models. We used a range of -automatic evaluation metrics, including BLEU, ROUGE, SARI, and BERTscore, and -also conducted human evaluations. BART-Large with Control Token (BART-L-w-CT) -mechanisms reported the highest SARI score of 46.54 and T5-base reported the -highest BERTscore 72.62. In human evaluation, BART-L-w-CTs achieved a better -simplicity score over T5-Base (2.9 vs. 2.2), while T5-Base achieved a better -meaning preservation score over BART-L-w-CTs (3.1 vs. 2.6). We also categorised -the system outputs with examples, hoping this will shed some light for future -research on this task. 
Our code, fine-tuned models, and data splits are -available at \url{https://github.com/HECTA-UoM/PLABA-MU} - ---------------- - -### 23 Jun 2023 | [Efficient Online Processing with Deep Neural Networks](https://arxiv.org/abs/2306.13474) | [⬇️](https://arxiv.org/pdf/2306.13474) -*Lukas Hedegaard* - - The capabilities and adoption of deep neural networks (DNNs) grow at an -exhilarating pace: Vision models accurately classify human actions in videos -and identify cancerous tissue in medical scans as precisely than human experts; -large language models answer wide-ranging questions, generate code, and write -prose, becoming the topic of everyday dinner-table conversations. Even though -their uses are exhilarating, the continually increasing model sizes and -computational complexities have a dark side. The economic cost and negative -environmental externalities of training and serving models is in evident -disharmony with financial viability and climate action goals. - Instead of pursuing yet another increase in predictive performance, this -dissertation is dedicated to the improvement of neural network efficiency. -Specifically, a core contribution addresses the efficiency aspects during -online inference. Here, the concept of Continual Inference Networks (CINs) is -proposed and explored across four publications. CINs extend prior -state-of-the-art methods developed for offline processing of spatio-temporal -data and reuse their pre-trained weights, improving their online processing -efficiency by an order of magnitude. These advances are attained through a -bottom-up computational reorganization and judicious architectural -modifications. The benefit to online inference is demonstrated by reformulating -several widely used network architectures into CINs, including 3D CNNs, -ST-GCNs, and Transformer Encoders. An orthogonal contribution tackles the -concurrent adaptation and computational acceleration of a large source model -into multiple lightweight derived models. Drawing on fusible adapter networks -and structured pruning, Structured Pruning Adapters achieve superior predictive -accuracy under aggressive pruning using significantly fewer learned weights -compared to fine-tuning with pruning. - ---------------- - -### 31 Mar 2023 | [Pair Programming with Large Language Models for Sampling and Estimation of Copulas](https://arxiv.org/abs/2303.18116) | [⬇️](https://arxiv.org/pdf/2303.18116) -*Jan G\'orecki* - - Without writing a single line of code by a human, an example Monte Carlo -simulation based application for stochastic dependence modeling with copulas is -developed using a state-of-the-art large language model (LLM) fine-tuned for -conversations. This includes interaction with ChatGPT in natural language and -using mathematical formalism, which, under careful supervision by a -human-expert, led to producing a working code in MATLAB, Python and R for -sampling from a given copula model, evaluation of the model's density, -performing maximum likelihood estimation, optimizing the code for parallel -computing for CPUs as well as for GPUs, and visualization of the computed -results. In contrast to other emerging studies that assess the accuracy of LLMs -like ChatGPT on tasks from a selected area, this work rather investigates ways -how to achieve a successful solution of a standard statistical task in a -collaboration of a human-expert and artificial intelligence (AI). 
Particularly, -through careful prompt engineering, we separate successful solutions generated -by ChatGPT from unsuccessful ones, resulting in a comprehensive list of related -pros and cons. It is demonstrated that if the typical pitfalls are avoided, we -can substantially benefit from collaborating with an AI partner. For example, -we show that if ChatGPT is not able to provide a correct solution due to a lack -of or incorrect knowledge, the human-expert can feed it with the correct -knowledge, e.g., in the form of mathematical theorems and formulas, and make it -to apply the gained knowledge in order to provide a solution that is correct. -Such ability presents an attractive opportunity to achieve a programmed -solution even for users with rather limited knowledge of programming -techniques. - ---------------- - -### 13 Oct 2021 | [EXplainable Neural-Symbolic Learning (X-NeSyL) methodology to fuse deep learning representations with expert knowledge graphs: the MonuMAI cultural heritage use case](https://arxiv.org/abs/2104.11914) | [⬇️](https://arxiv.org/pdf/2104.11914) -*Natalia D\'iaz-Rodr\'iguez, Alberto Lamas, Jules Sanchez, Gianni Franchi, Ivan Donadello, Siham Tabik, David Filliat, Policarpo Cruz, Rosana Montes, Francisco Herrera* - - The latest Deep Learning (DL) models for detection and classification have -achieved an unprecedented performance over classical machine learning -algorithms. However, DL models are black-box methods hard to debug, interpret, -and certify. DL alone cannot provide explanations that can be validated by a -non technical audience. In contrast, symbolic AI systems that convert concepts -into rules or symbols -- such as knowledge graphs -- are easier to explain. -However, they present lower generalisation and scaling capabilities. A very -important challenge is to fuse DL representations with expert knowledge. One -way to address this challenge, as well as the performance-explainability -trade-off is by leveraging the best of both streams without obviating domain -expert knowledge. We tackle such problem by considering the symbolic knowledge -is expressed in form of a domain expert knowledge graph. We present the -eXplainable Neural-symbolic learning (X-NeSyL) methodology, designed to learn -both symbolic and deep representations, together with an explainability metric -to assess the level of alignment of machine and human expert explanations. The -ultimate objective is to fuse DL representations with expert domain knowledge -during the learning process to serve as a sound basis for explainability. -X-NeSyL methodology involves the concrete use of two notions of explanation at -inference and training time respectively: 1) EXPLANet: Expert-aligned -eXplainable Part-based cLAssifier NETwork Architecture, a compositional CNN -that makes use of symbolic representations, and 2) SHAP-Backprop, an -explainable AI-informed training procedure that guides the DL process to align -with such symbolic representations in form of knowledge graphs. We showcase -X-NeSyL methodology using MonuMAI dataset for monument facade image -classification, and demonstrate that our approach improves explainability and -performance. 
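
To make the X-NeSyL alignment idea above concrete, here is a minimal, hypothetical sketch (not the authors' code): per-part attribution scores from a part-based classifier are checked against an expert knowledge graph, and any attribution mass placed on parts that the graph does not associate with the predicted class is accumulated as a penalty that could be added to the training loss, in the spirit of SHAP-Backprop. The class names, part names, and every identifier below are illustrative assumptions, not taken from the paper's implementation.

```python
# Illustrative sketch of X-NeSyL-style attribution / knowledge-graph alignment.
# Assumption: the expert knowledge graph is reduced to a dict mapping each
# architectural style (class) to the facade parts an expert associates with it.
knowledge_graph = {
    "hispanic-muslim": {"horseshoe_arch", "lobed_arch"},
    "gothic": {"pointed_arch", "rose_window"},
}

def misattribution_penalty(predicted_class, part_attributions, graph):
    """Sum the attribution mass assigned to parts that the expert graph does
    NOT link to the predicted class (a SHAP-Backprop-like alignment signal)."""
    expected_parts = graph.get(predicted_class, set())
    return sum(score for part, score in part_attributions.items()
               if part not in expected_parts and score > 0)

# Toy usage: the classifier predicted "gothic" but attributed importance to a
# horseshoe arch, which the expert graph ties to a different style.
attributions = {"pointed_arch": 0.6, "horseshoe_arch": 0.3, "rose_window": 0.1}
penalty = misattribution_penalty("gothic", attributions, knowledge_graph)
print(f"misattribution penalty: {penalty:.2f}")  # 0.30, added to the training loss
```

In X-NeSyL terms, a lower penalty means the machine explanation agrees more closely with the expert explanation encoded in the knowledge graph.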
- ---------------- - -### 19 Aug 2023 | [A Unified Continual Learning Framework with General Parameter-Efficient Tuning](https://arxiv.org/abs/2303.10070) | [⬇️](https://arxiv.org/pdf/2303.10070) -*Qiankun Gao, Chen Zhao, Yifan Sun, Teng Xi, Gang Zhang, Bernard Ghanem, Jian Zhang* - - The "pre-training $\rightarrow$ downstream adaptation" presents both new -opportunities and challenges for Continual Learning (CL). Although the recent -state-of-the-art in CL is achieved through Parameter-Efficient-Tuning (PET) -adaptation paradigm, only prompt has been explored, limiting its application to -Transformers only. In this paper, we position prompting as one instantiation of -PET, and propose a unified CL framework with general PET, dubbed as -Learning-Accumulation-Ensemble (LAE). PET, e.g., using Adapter, LoRA, or -Prefix, can adapt a pre-trained model to downstream tasks with fewer parameters -and resources. Given a PET method, our LAE framework incorporates it for CL -with three novel designs. 1) Learning: the pre-trained model adapts to the new -task by tuning an online PET module, along with our adaptation speed -calibration to align different PET modules, 2) Accumulation: the task-specific -knowledge learned by the online PET module is accumulated into an offline PET -module through momentum update, 3) Ensemble: During inference, we respectively -construct two experts with online/offline PET modules (which are favored by the -novel/historical tasks) for prediction ensemble. We show that LAE is compatible -with a battery of PET methods and gains strong CL capability. For example, LAE -with Adaptor PET surpasses the prior state-of-the-art by 1.3% and 3.6% in -last-incremental accuracy on CIFAR100 and ImageNet-R datasets, respectively. -Code is available at \url{https://github.com/gqk/LAE}. - ---------------- - -### 03 Jul 2017 | [Active Self-Paced Learning for Cost-Effective and Progressive Face Identification](https://arxiv.org/abs/1701.03555) | [⬇️](https://arxiv.org/pdf/1701.03555) -*Liang Lin and Keze Wang and Deyu Meng and Wangmeng Zuo and Lei Zhang* - - This paper aims to develop a novel cost-effective framework for face -identification, which progressively maintains a batch of classifiers with the -increasing face images of different individuals. By naturally combining two -recently rising techniques: active learning (AL) and self-paced learning (SPL), -our framework is capable of automatically annotating new instances and -incorporating them into training under weak expert re-certification. We first -initialize the classifier using a few annotated samples for each individual, -and extract image features using the convolutional neural nets. Then, a number -of candidates are selected from the unannotated samples for classifier -updating, in which we apply the current classifiers ranking the samples by the -prediction confidence. In particular, our approach utilizes the high-confidence -and low-confidence samples in the self-paced and the active user-query way, -respectively. The neural nets are later fine-tuned based on the updated -classifiers. Such heuristic implementation is formulated as solving a concise -active SPL optimization problem, which also advances the SPL development by -supplementing a rational dynamic curriculum constraint. The new model finely -accords with the "instructor-student-collaborative" learning mode in human -education. 
The advantages of this proposed framework are two-folds: i) The -required number of annotated samples is significantly decreased while the -comparable performance is guaranteed. A dramatic reduction of user effort is -also achieved over other state-of-the-art active learning techniques. ii) The -mixture of SPL and AL effectively improves not only the classifier accuracy -compared to existing AL/SPL methods but also the robustness against noisy data. -We evaluate our framework on two challenging datasets, and demonstrate very -promising results. (http://hcp.sysu.edu.cn/projects/aspl/) - ---------------- - -### 02 Jun 2023 | [Multi-Level Knowledge Distillation for Out-of-Distribution Detection in Text](https://arxiv.org/abs/2211.11300) | [⬇️](https://arxiv.org/pdf/2211.11300) -*Qianhui Wu, Huiqiang Jiang, Haonan Yin, B\"orje F. Karlsson, Chin-Yew Lin* - - Self-supervised representation learning has proved to be a valuable component -for out-of-distribution (OoD) detection with only the texts of in-distribution -(ID) examples. These approaches either train a language model from scratch or -fine-tune a pre-trained language model using ID examples, and then take the -perplexity output by the language model as OoD scores. In this paper, we -analyze the complementary characteristics of both OoD detection methods and -propose a multi-level knowledge distillation approach that integrates their -strengths while mitigating their limitations. Specifically, we use a fine-tuned -model as the teacher to teach a randomly initialized student model on the ID -examples. Besides the prediction layer distillation, we present a -similarity-based intermediate layer distillation method to thoroughly explore -the representation space of the teacher model. In this way, the learned student -can better represent the ID data manifold while gaining a stronger ability to -map OoD examples outside the ID data manifold with the regularization inherited -from pre-training. Besides, the student model sees only ID examples during -parameter learning, further promoting more distinguishable features for OoD -detection. We conduct extensive experiments over multiple benchmark datasets, -i.e., CLINC150, SST, ROSTD, 20 NewsGroups, and AG News; showing that the -proposed method yields new state-of-the-art performance. We also explore its -application as an AIGC detector to distinguish between answers generated by -ChatGPT and human experts. It is observed that our model exceeds human -evaluators in the pair-expert task on the Human ChatGPT Comparison Corpus. - ---------------- - -### 07 Jul 2023 | [Distilling BlackBox to Interpretable models for Efficient Transfer Learning](https://arxiv.org/abs/2305.17303) | [⬇️](https://arxiv.org/pdf/2305.17303) -*Shantanu Ghosh, Ke Yu, Kayhan Batmanghelich* - - Building generalizable AI models is one of the primary challenges in the -healthcare domain. While radiologists rely on generalizable descriptive rules -of abnormality, Neural Network (NN) models suffer even with a slight shift in -input distribution (e.g., scanner type). Fine-tuning a model to transfer -knowledge from one domain to another requires a significant amount of labeled -data in the target domain. In this paper, we develop an interpretable model -that can be efficiently fine-tuned to an unseen target domain with minimal -computational cost. We assume the interpretable component of NN to be -approximately domain-invariant. However, interpretable models typically -underperform compared to their Blackbox (BB) variants. 
We start with a BB in -the source domain and distill it into a \emph{mixture} of shallow interpretable -models using human-understandable concepts. As each interpretable model covers -a subset of data, a mixture of interpretable models achieves comparable -performance as BB. Further, we use the pseudo-labeling technique from -semi-supervised learning (SSL) to learn the concept classifier in the target -domain, followed by fine-tuning the interpretable models in the target domain. -We evaluate our model using a real-life large-scale chest-X-ray (CXR) -classification dataset. The code is available at: -\url{https://github.com/batmanlab/MICCAI-2023-Route-interpret-repeat-CXRs}. - ---------------- - -### 19 Jun 2023 | [JiuZhang 2.0: A Unified Chinese Pre-trained Language Model for Multi-task Mathematical Problem Solving](https://arxiv.org/abs/2306.11027) | [⬇️](https://arxiv.org/pdf/2306.11027) -*Wayne Xin Zhao, Kun Zhou, Beichen Zhang, Zheng Gong, Zhipeng Chen, Yuanhang Zhou, Ji-Rong Wen, Jing Sha, Shijin Wang, Cong Liu, Guoping Hu* - - Although pre-trained language models~(PLMs) have recently advanced the -research progress in mathematical reasoning, they are not specially designed as -a capable multi-task solver, suffering from high cost for multi-task deployment -(\eg a model copy for a task) and inferior performance on complex mathematical -problems in practical applications. To address these issues, in this paper, we -propose \textbf{JiuZhang~2.0}, a unified Chinese PLM specially for multi-task -mathematical problem solving. Our idea is to maintain a moderate-sized model -and employ the \emph{cross-task knowledge sharing} to improve the model -capacity in a multi-task setting. Specially, we construct a -Mixture-of-Experts~(MoE) architecture for modeling mathematical text, so as to -capture the common mathematical knowledge across tasks. For optimizing the MoE -architecture, we design \emph{multi-task continual pre-training} and -\emph{multi-task fine-tuning} strategies for multi-task adaptation. These -training strategies can effectively decompose the knowledge from the task data -and establish the cross-task sharing via expert networks. In order to further -improve the general capacity of solving different complex tasks, we leverage -large language models~(LLMs) as complementary models to iteratively refine the -generated solution by our PLM, via in-context learning. Extensive experiments -have demonstrated the effectiveness of our model. - ---------------- - -### 25 Mar 2021 | [Learning to Segment from Scribbles using Multi-scale Adversarial Attention Gates](https://arxiv.org/abs/2007.01152) | [⬇️](https://arxiv.org/pdf/2007.01152) -*Gabriele Valvano, Andrea Leo, Sotirios A. Tsaftaris* - - Large, fine-grained image segmentation datasets, annotated at pixel-level, -are difficult to obtain, particularly in medical imaging, where annotations -also require expert knowledge. Weakly-supervised learning can train models by -relying on weaker forms of annotation, such as scribbles. Here, we learn to -segment using scribble annotations in an adversarial game. With unpaired -segmentation masks, we train a multi-scale GAN to generate realistic -segmentation masks at multiple resolutions, while we use scribbles to learn -their correct position in the image. Central to the model's success is a novel -attention gating mechanism, which we condition with adversarial signals to act -as a shape prior, resulting in better object localization at multiple scales. 
-Subject to adversarial conditioning, the segmentor learns attention maps that -are semantic, suppress the noisy activations outside the objects, and reduce -the vanishing gradient problem in the deeper layers of the segmentor. We -evaluated our model on several medical (ACDC, LVSC, CHAOS) and non-medical -(PPSS) datasets, and we report performance levels matching those achieved by -models trained with fully annotated segmentation masks. We also demonstrate -extensions in a variety of settings: semi-supervised learning; combining -multiple scribble sources (a crowdsourcing scenario) and multi-task learning -(combining scribble and mask supervision). We release expert-made scribble -annotations for the ACDC dataset, and the code used for the experiments, at -https://vios-s.github.io/multiscale-adversarial-attention-gates - ---------------- - -### 19 Aug 2015 | [Recognizing Extended Spatiotemporal Expressions by Actively Trained Average Perceptron Ensembles](https://arxiv.org/abs/1508.04525) | [⬇️](https://arxiv.org/pdf/1508.04525) -*Wei Zhang, Yang Yu, Osho Gupta, Judith Gelernter* - - Precise geocoding and time normalization for text requires that location and -time phrases be identified. Many state-of-the-art geoparsers and temporal -parsers suffer from low recall. Categories commonly missed by parsers are: -nouns used in a non- spatiotemporal sense, adjectival and adverbial phrases, -prepositional phrases, and numerical phrases. We collected and annotated data -set by querying commercial web searches API with such spatiotemporal -expressions as were missed by state-of-the- art parsers. Due to the high cost -of sentence annotation, active learning was used to label training data, and a -new strategy was designed to better select training examples to reduce labeling -cost. For the learning algorithm, we applied an average perceptron trained -Featurized Hidden Markov Model (FHMM). Five FHMM instances were used to create -an ensemble, with the output phrase selected by voting. Our ensemble model was -tested on a range of sequential labeling tasks, and has shown competitive -performance. Our contributions include (1) an new dataset annotated with named -entities and expanded spatiotemporal expressions; (2) a comparison of inference -algorithms for ensemble models showing the superior accuracy of Belief -Propagation over Viterbi Decoding; (3) a new example re-weighting method for -active ensemble learning that 'memorizes' the latest examples trained; (4) a -spatiotemporal parser that jointly recognizes expanded spatiotemporal -expressions as well as named entities. - ---------------- - -### 23 Nov 2017 | [Knowledge Concentration: Learning 100K Object Classifiers in a Single CNN](https://arxiv.org/abs/1711.07607) | [⬇️](https://arxiv.org/pdf/1711.07607) -*Jiyang Gao, Zijian (James) Guo, Zhen Li, Ram Nevatia* - - Fine-grained image labels are desirable for many computer vision -applications, such as visual search or mobile AI assistant. These applications -rely on image classification models that can produce hundreds of thousands -(e.g. 100K) of diversified fine-grained image labels on input images. However, -training a network at this vocabulary scale is challenging, and suffers from -intolerable large model size and slow training speed, which leads to -unsatisfying classification performance. A straightforward solution would be -training separate expert networks (specialists), with each specialist focusing -on learning one specific vertical (e.g. cars, birds...). 
However, deploying -dozens of expert networks in a practical system would significantly increase -system complexity and inference latency, and consumes large amounts of -computational resources. To address these challenges, we propose a Knowledge -Concentration method, which effectively transfers the knowledge from dozens of -specialists (multiple teacher networks) into one single model (one student -network) to classify 100K object categories. There are three salient aspects in -our method: (1) a multi-teacher single-student knowledge distillation -framework; (2) a self-paced learning mechanism to allow the student to learn -from different teachers at various paces; (3) structurally connected layers to -expand the student network capacity with limited extra parameters. We validate -our method on OpenImage and a newly collected dataset, Entity-Foto-Tree (EFT), -with 100K categories, and show that the proposed model performs significantly -better than the baseline generalist model. - ---------------- - -### 09 Feb 2023 | [Lightweight Transformers for Clinical Natural Language Processing](https://arxiv.org/abs/2302.04725) | [⬇️](https://arxiv.org/pdf/2302.04725) -*Omid Rohanian, Mohammadmahdi Nouriborji, Hannah Jauncey, Samaneh Kouchaki, ISARIC Clinical Characterisation Group, Lei Clifton, Laura Merson, David A. Clifton* - - Specialised pre-trained language models are becoming more frequent in NLP -since they can potentially outperform models trained on generic texts. BioBERT -and BioClinicalBERT are two examples of such models that have shown promise in -medical NLP tasks. Many of these models are overparametrised and -resource-intensive, but thanks to techniques like Knowledge Distillation (KD), -it is possible to create smaller versions that perform almost as well as their -larger counterparts. In this work, we specifically focus on development of -compact language models for processing clinical texts (i.e. progress notes, -discharge summaries etc). We developed a number of efficient lightweight -clinical transformers using knowledge distillation and continual learning, with -the number of parameters ranging from 15 million to 65 million. These models -performed comparably to larger models such as BioBERT and ClinicalBioBERT and -significantly outperformed other compact models trained on general or -biomedical data. Our extensive evaluation was done across several standard -datasets and covered a wide range of clinical text-mining tasks, including -Natural Language Inference, Relation Extraction, Named Entity Recognition, and -Sequence Classification. To our knowledge, this is the first comprehensive -study specifically focused on creating efficient and compact transformers for -clinical NLP tasks. The models and code used in this study can be found on our -Huggingface profile at https://huggingface.co./nlpie and Github page at -https://github.com/nlpie-research/Lightweight-Clinical-Transformers, -respectively, promoting reproducibility of our results. - ---------------- - - - - -# 8 - - - - - - - - - - - -## Outline with Emojis - -1. 🤖 Persona-based Language Models: Incorporating user-defined personas or character sheets to influence the language model's response style and knowledge base. -2. 🧠 Memory-Augmented Models: Leveraging external memory modules or episodic memory mechanisms to persist context and preferences across conversations. -3. 
🔄 Iterative Refinement: Allowing users to provide feedback and iteratively refine the character's personality and expertise through interactive sessions. - -## Keyword Glossary 🔑 - -- 🎭 Persona -- 💾 Memory -- 🔁 Iterative Learning -- 📝 Character Sheet -- 🧠 Context Persistence -- 💻 User Interaction - -```mermaid -graph LR - Persona-->Memory - Persona-->IterativeLearning - CharacterSheet-->Persona - ContextPersistence-->Memory - ContextPersistence-->IterativeLearning - UserInteraction-->IterativeLearning - UserInteraction-->CharacterSheet -``` - -```python -import streamlit as st -import pandas as pd - -st.title("Character Personality and Constitution") - -# Define the glossary terms -glossary = { - "Persona": "A predefined set of traits, preferences, and knowledge that shapes the language model's response style.", - "Memory": "An external or internal mechanism for storing and retrieving context and preferences across conversations.", - "Iterative Learning": "The process of refining the character's personality and expertise through user feedback and interactive sessions.", - "Character Sheet": "A user-defined representation of the character's personality, traits, and expertise.", - "Context Persistence": "The ability to maintain and apply context and preferences across multiple interactions.", - "User Interaction": "The means by which users can provide feedback, modify the character sheet, and influence the character's personality and expertise." -} - -# Display the glossary terms -st.header("Glossary") -for term, definition in glossary.items(): - st.subheader(term) - st.write(definition) - -# Allow users to modify the character sheet -st.header("Character Sheet") -character_sheet = st.text_area("Enter your character sheet details:") - -# Display the user-defined character sheet -if character_sheet: - st.subheader("Your Character Sheet") - st.write(character_sheet) - -# Simulate the application of the character sheet and context persistence -st.header("Interaction") -user_input = st.text_input("Enter your message:") -if user_input: - # Simulated response based on the character sheet and context - response = f"Based on your character sheet and context, my response is: {user_input.upper()}" - st.write(response) -``` - - -# 🩺🔍 Search Results -### 22 Oct 2023 | [MIRACLE: Towards Personalized Dialogue Generation with Latent-Space Multiple Personal Attribute Control](https://arxiv.org/abs/2310.18342) | [⬇️](https://arxiv.org/pdf/2310.18342) -*Zhenyi Lu, Wei Wei, Xiaoye Qu, XianLing Mao, Dangyang Chen, Jixiong Chen* - - Personalized dialogue systems aim to endow the chatbot agent with more -anthropomorphic traits for human-like interactions. Previous approaches have -explored explicitly user profile modeling using text descriptions, implicit -derivation of user embeddings, or utilizing handicraft prompts for ChatGPT-like -models. However, textual personas are limited in describing multi-faceted -attributes (\emph{e.g.}, \emph{language style, inner character nuances}), -implicit embedding suffers from personality sparsity, and handicraft prompts -lack fine-grained and stable controllability. Hence, these approaches may -struggle with complex personalized dialogue generation tasks that require -generating controllable responses with multiple personal attributes. 
To this -end, we propose \textbf{\textsc{Miracle}}, a novel personalized dialogue -generation method through \textbf{M}ult\textbf{I}ple Pe\textbf{R}sonal -\textbf{A}ttributes \textbf{C}ontrol within \textbf{L}atent-Space -\textbf{E}nergy-based Models. Specifically, our approach -first disentangles complex personality into multi-faceted attributes. -Subsequently, we employ a conditional variational auto-encoder to align with -the dense personalized responses within a latent joint attribute space. We have -also tailored a dedicated energy function and customized the ordinary -differential equations sampling method to offer flexible attribute composition -and precise attribute control. Extensive experiments demonstrate that -\textsc{Miracle} outperforms several strong baselines in terms of personality -controllability and response generation quality. Our dataset and code are -available at \url{https://github.com/LZY-the-boys/MIRACLE} - ---------------- - -### 12 Dec 2022 | [ABINet++: Autonomous, Bidirectional and Iterative Language Modeling for Scene Text Spotting](https://arxiv.org/abs/2211.10578) | [⬇️](https://arxiv.org/pdf/2211.10578) -*Shancheng Fang, Zhendong Mao, Hongtao Xie, Yuxin Wang, Chenggang Yan, Yongdong Zhang* - - Scene text spotting is of great importance to the computer vision community -due to its wide variety of applications. Recent methods attempt to introduce -linguistic knowledge for challenging recognition rather than pure visual -classification. However, how to effectively model the linguistic rules in -end-to-end deep networks remains a research challenge. In this paper, we argue -that the limited capacity of language models comes from 1) implicit language -modeling; 2) unidirectional feature representation; and 3) language model with -noise input. Correspondingly, we propose an autonomous, bidirectional and -iterative ABINet++ for scene text spotting. Firstly, the autonomous suggests -enforcing explicitly language modeling by decoupling the recognizer into vision -model and language model and blocking gradient flow between both models. -Secondly, a novel bidirectional cloze network (BCN) as the language model is -proposed based on bidirectional feature representation. Thirdly, we propose an -execution manner of iterative correction for the language model which can -effectively alleviate the impact of noise input. Finally, to polish ABINet++ in -long text recognition, we propose to aggregate horizontal features by embedding -Transformer units inside a U-Net, and design a position and content attention -module which integrates character order and content to attend to character -features precisely. ABINet++ achieves state-of-the-art performance on both -scene text recognition and scene text spotting benchmarks, which consistently -demonstrates the superiority of our method in various environments especially -on low-quality images. Besides, extensive experiments including in English and -Chinese also prove that, a text spotter that incorporates our language modeling -method can significantly improve its performance both in accuracy and speed -compared with commonly used attention-based recognizers.
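
The "iterative correction" step described above can be illustrated with a deliberately simplified sketch (not the ABINet++ implementation): a noisy prediction from the vision side is repeatedly passed through a language-side corrector until it stops changing. Here the corrector is just a nearest-word lexicon lookup; in the paper it is a bidirectional cloze network operating on character features. All names (`LEXICON`, `language_correct`) are illustrative assumptions.

```python
# Toy illustration of iterative vision/language correction (NOT ABINet++ itself):
# a noisy character prediction is repeatedly "corrected" by a lexicon-based
# stand-in for the bidirectional cloze language model until it reaches a fixed point.
LEXICON = {"COFFEE", "STREET", "MARKET", "SCHOOL"}

def language_correct(word):
    """Replace the word with the closest lexicon entry (rough Hamming distance)."""
    def dist(a, b):
        return sum(x != y for x, y in zip(a, b)) + abs(len(a) - len(b))
    return min(LEXICON, key=lambda cand: dist(word, cand))

def iterative_correction(vision_pred, max_iters=3):
    current = vision_pred
    for _ in range(max_iters):
        corrected = language_correct(current)
        if corrected == current:  # fixed point reached, stop iterating
            break
        current = corrected
    return current

print(iterative_correction("C0FFEE"))  # -> COFFEE
```

The fixed-point loop mirrors how ABINet++ feeds fused vision-language features back into the language model for a small number of refinement iterations.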
- ---------------- - -### 12 Nov 2023 | [ChatAnything: Facetime Chat with LLM-Enhanced Personas](https://arxiv.org/abs/2311.06772) | [⬇️](https://arxiv.org/pdf/2311.06772) -*Yilin Zhao, Xinbin Yuan, Shanghua Gao, Zhijie Lin, Qibin Hou, Jiashi Feng, Daquan Zhou* - - In this technical report, we target generating anthropomorphized personas for -LLM-based characters in an online manner, including visual appearance, -personality and tones, with only text descriptions. To achieve this, we first -leverage the in-context learning capability of LLMs for personality generation -by carefully designing a set of system prompts. We then propose two novel -concepts: the mixture of voices (MoV) and the mixture of diffusers (MoD) for -diverse voice and appearance generation. For MoV, we utilize the text-to-speech -(TTS) algorithms with a variety of pre-defined tones and select the most -matching one based on the user-provided text description automatically. For -MoD, we combine the recent popular text-to-image generation techniques and -talking head algorithms to streamline the process of generating talking -objects. We termed the whole framework as ChatAnything. With it, users could be -able to animate anything with any personas that are anthropomorphic using just -a few text inputs. However, we have observed that the anthropomorphic objects -produced by current generative models are often undetectable by pre-trained -face landmark detectors, leading to failure of the face motion generation, even -if these faces possess human-like appearances because those images are nearly -seen during the training (e.g., OOD samples). To address this issue, we -incorporate pixel-level guidance to infuse human face landmarks during the -image generation phase. To benchmark these metrics, we have built an evaluation -dataset. Based on it, we verify that the detection rate of the face landmark is -significantly increased from 57.0% to 92.5% thus allowing automatic face -animation based on generated speech content. The code and more results can be -found at https://chatanything.github.io/. - ---------------- - -### 23 Feb 2019 | [textTOvec: Deep Contextualized Neural Autoregressive Topic Models of Language with Distributed Compositional Prior](https://arxiv.org/abs/1810.03947) | [⬇️](https://arxiv.org/pdf/1810.03947) -*Pankaj Gupta and Yatin Chaudhary and Florian Buettner and Hinrich Sch\"utze* - - We address two challenges of probabilistic topic modelling in order to better -estimate the probability of a word in a given context, i.e., P(word|context): -(1) No Language Structure in Context: Probabilistic topic models ignore word -order by summarizing a given context as a "bag-of-word" and consequently the -semantics of words in the context is lost. The LSTM-LM learns a vector-space -representation of each word by accounting for word order in local collocation -patterns and models complex characteristics of language (e.g., syntax and -semantics), while the TM simultaneously learns a latent representation from the -entire document and discovers the underlying thematic structure. We unite two -complementary paradigms of learning the meaning of word occurrences by -combining a TM (e.g., DocNADE) and a LM in a unified probabilistic framework, -named as ctx-DocNADE. (2) Limited Context and/or Smaller training corpus of -documents: In settings with a small number of word occurrences (i.e., lack of -context) in short text or data sparsity in a corpus of few documents, the -application of TMs is challenging. 
We address this challenge by incorporating -external knowledge into neural autoregressive topic models via a language -modelling approach: we use word embeddings as input of a LSTM-LM with the aim -to improve the word-topic mapping on a smaller and/or short-text corpus. The -proposed DocNADE extension is named as ctx-DocNADEe. - We present novel neural autoregressive topic model variants coupled with -neural LMs and embeddings priors that consistently outperform state-of-the-art -generative TMs in terms of generalization (perplexity), interpretability (topic -coherence) and applicability (retrieval and classification) over 6 long-text -and 8 short-text datasets from diverse domains. - ---------------- - -### 21 Dec 2023 | [Exploiting Contextual Target Attributes for Target Sentiment Classification](https://arxiv.org/abs/2312.13766) | [⬇️](https://arxiv.org/pdf/2312.13766) -*Bowen Xing and Ivor W. Tsang* - - Existing PTLM-based models for TSC can be categorized into two groups: 1) -fine-tuning-based models that adopt PTLM as the context encoder; 2) -prompting-based models that transfer the classification task to the text/word -generation task. In this paper, we present a new perspective of leveraging PTLM -for TSC: simultaneously leveraging the merits of both language modeling and -explicit target-context interactions via contextual target attributes. -Specifically, we design the domain- and target-constrained cloze test, which -can leverage the PTLMs' strong language modeling ability to generate the given -target's attributes pertaining to the review context. The attributes contain -the background and property information of the target, which can help to enrich -the semantics of the review context and the target. To exploit the attributes -for tackling TSC, we first construct a heterogeneous information graph by -treating the attributes as nodes and combining them with (1) the syntax graph -automatically produced by the off-the-shelf dependency parser and (2) the -semantics graph of the review context, which is derived from the self-attention -mechanism. Then we propose a heterogeneous information gated graph -convolutional network to model the interactions among the attribute -information, the syntactic information, and the contextual information. The -experimental results on three benchmark datasets demonstrate the superiority of -our model, which achieves new state-of-the-art performance. - ---------------- - -### 22 Feb 2024 | [Learning High-Quality and General-Purpose Phrase Representations](https://arxiv.org/abs/2401.10407) | [⬇️](https://arxiv.org/pdf/2401.10407) -*Lihu Chen and Ga\"el Varoquaux and Fabian M. Suchanek* - - Phrase representations play an important role in data science and natural -language processing, benefiting various tasks like Entity Alignment, Record -Linkage, Fuzzy Joins, and Paraphrase Classification. The current -state-of-the-art method involves fine-tuning pre-trained language models for -phrasal embeddings using contrastive learning. However, we have identified -areas for improvement. First, these pre-trained models tend to be unnecessarily -complex and require to be pre-trained on a corpus with context sentences. -Second, leveraging the phrase type and morphology gives phrase representations -that are both more precise and more flexible. We propose an improved framework -to learn phrase representations in a context-free fashion. 
The framework -employs phrase type classification as an auxiliary task and incorporates -character-level information more effectively into the phrase representation. -Furthermore, we design three granularities of data augmentation to increase the -diversity of training samples. Our experiments across a wide range of tasks -show that our approach generates superior phrase embeddings compared to -previous methods while requiring a smaller model size. [PEARL-small]: -https://huggingface.co./Lihuchen/pearl_small; [PEARL-base]: -https://huggingface.co./Lihuchen/pearl_base; [Code and Dataset]: -https://github.com/tigerchen52/PEARL - ---------------- - -### 07 Nov 2023 | [Personality Style Recognition via Machine Learning: Identifying Anaclitic and Introjective Personality Styles from Patients' Speech](https://arxiv.org/abs/2311.04088) | [⬇️](https://arxiv.org/pdf/2311.04088) -*Semere Kiros Bitew, Vincent Schelstraete, Klim Zaporojets, Kimberly Van Nieuwenhove, Reitske Meganck and Chris Develder* - - In disentangling the heterogeneity observed in psychopathology, personality -of the patients is considered crucial. While it has been demonstrated that -personality traits are reflected in the language used by a patient, we -hypothesize that this enables automatic inference of the personality type -directly from speech utterances, potentially more accurately than through a -traditional questionnaire-based approach explicitly designed for personality -classification. To validate this hypothesis, we adopt natural language -processing (NLP) and standard machine learning tools for classification. We -test this on a dataset of recorded clinical diagnostic interviews (CDI) on a -sample of 79 patients diagnosed with major depressive disorder (MDD) -- a -condition for which differentiated treatment based on personality styles has -been advocated -- and classified into anaclitic and introjective personality -styles. We start by analyzing the interviews to see which linguistic features -are associated with each style, in order to gain a better understanding of the -styles. Then, we develop automatic classifiers based on (a) standardized -questionnaire responses; (b) basic text features, i.e., TF-IDF scores of words -and word sequences; (c) more advanced text features, using LIWC (linguistic -inquiry and word count) and context-aware features using BERT (bidirectional -encoder representations from transformers); (d) audio features. We find that -automated classification with language-derived features (i.e., based on LIWC) -significantly outperforms questionnaire-based classification models. -Furthermore, the best performance is achieved by combining LIWC with the -questionnaire features. This suggests that more work should be put into -developing linguistically based automated techniques for characterizing -personality, however questionnaires still to some extent complement such -methods. - ---------------- - -### 13 Mar 2019 | [Persona-Aware Tips Generation](https://arxiv.org/abs/1903.02156) | [⬇️](https://arxiv.org/pdf/1903.02156) -*Piji Li, Zihao Wang, Lidong Bing, Wai Lam* - - Tips, as a compacted and concise form of reviews, were paid less attention by -researchers. In this paper, we investigate the task of tips generation by -considering the `persona' information which captures the intrinsic language -style of the users or the different characteristics of the product items. 
In -order to exploit the persona information, we propose a framework based on -adversarial variational auto-encoders (aVAE) for persona modeling from the -historical tips and reviews of users and items. The latent variables from aVAE -are regarded as persona embeddings. Besides representing persona using the -latent embeddings, we design a persona memory for storing the persona related -words for users and items. Pointer Network is used to retrieve persona wordings -from the memory when generating tips. Moreover, the persona embeddings are used -as latent factors by a rating prediction component to predict the sentiment of -a user over an item. Finally, the persona embeddings and the sentiment -information are incorporated into a recurrent neural networks based tips -generation component. Extensive experimental results are reported and discussed -to elaborate the peculiarities of our framework. - ---------------- - -### 08 Dec 2023 | [User-Aware Prefix-Tuning is a Good Learner for Personalized Image Captioning](https://arxiv.org/abs/2312.04793) | [⬇️](https://arxiv.org/pdf/2312.04793) -*Xuan Wang, Guanhong Wang, Wenhao Chai, Jiayu Zhou, and Gaoang Wang* - - Image captioning bridges the gap between vision and language by automatically -generating natural language descriptions for images. Traditional image -captioning methods often overlook the preferences and characteristics of users. -Personalized image captioning solves this problem by incorporating user prior -knowledge into the model, such as writing styles and preferred vocabularies. -Most existing methods emphasize the user context fusion process by memory -networks or transformers. However, these methods ignore the distinct domains of -each dataset. Therefore, they need to update the entire caption model -parameters when meeting new samples, which is time-consuming and -calculation-intensive. To address this challenge, we propose a novel -personalized image captioning framework that leverages user context to consider -personality factors. Additionally, our framework utilizes the prefix-tuning -paradigm to extract knowledge from a frozen large language model, reducing the -gap between different language domains. Specifically, we employ CLIP to extract -the visual features of an image and align the semantic space using a -query-guided mapping network. By incorporating the transformer layer, we merge -the visual features with the user's contextual prior knowledge to generate -informative prefixes. Moreover, we employ GPT-2 as the frozen large language -model. With a small number of parameters to be trained, our model performs -efficiently and effectively. Our model outperforms existing baseline models on -Instagram and YFCC100M datasets across five evaluation metrics, demonstrating -its superiority, including twofold improvements in metrics such as BLEU-4 and -CIDEr. - ---------------- - -### 10 Jul 2023 | [AmadeusGPT: a natural language interface for interactive animal behavioral analysis](https://arxiv.org/abs/2307.04858) | [⬇️](https://arxiv.org/pdf/2307.04858) -*Shaokai Ye, Jessy Lauer, Mu Zhou, Alexander Mathis, Mackenzie W. Mathis* - - The process of quantifying and analyzing animal behavior involves translating -the naturally occurring descriptive language of their actions into -machine-readable code. Yet, codifying behavior analysis is often challenging -without deep understanding of animal behavior and technical machine learning -knowledge. 
To limit this gap, we introduce AmadeusGPT: a natural language -interface that turns natural language descriptions of behaviors into -machine-executable code. Large-language models (LLMs) such as GPT3.5 and GPT4 -allow for interactive language-based queries that are potentially well suited -for making interactive behavior analysis. However, the comprehension capability -of these LLMs is limited by the context window size, which prevents it from -remembering distant conversations. To overcome the context window limitation, -we implement a novel dual-memory mechanism to allow communication between -short-term and long-term memory using symbols as context pointers for retrieval -and saving. Concretely, users directly use language-based definitions of -behavior and our augmented GPT develops code based on the core AmadeusGPT API, -which contains machine learning, computer vision, spatio-temporal reasoning, -and visualization modules. Users then can interactively refine results, and -seamlessly add new behavioral modules as needed. We benchmark AmadeusGPT and -show we can produce state-of-the-art performance on the MABE 2022 behavior -challenge tasks. Note, an end-user would not need to write any code to achieve -this. Thus, collectively AmadeusGPT presents a novel way to merge deep -biological knowledge, large-language models, and core computer vision modules -into a more naturally intelligent system. Code and demos can be found at: -https://github.com/AdaptiveMotorControlLab/AmadeusGPT. - ---------------- - -### 02 Sep 2021 | [One Chatbot Per Person: Creating Personalized Chatbots based on Implicit User Profiles](https://arxiv.org/abs/2108.09355) | [⬇️](https://arxiv.org/pdf/2108.09355) -*Zhengyi Ma, Zhicheng Dou, Yutao Zhu, Hanxun Zhong, Ji-Rong Wen* - - Personalized chatbots focus on endowing chatbots with a consistent -personality to behave like real users, give more informative responses, and -further act as personal assistants. Existing personalized approaches tried to -incorporate several text descriptions as explicit user profiles. However, the -acquisition of such explicit profiles is expensive and time-consuming, thus -being impractical for large-scale real-world applications. Moreover, the -restricted predefined profile neglects the language behavior of a real user and -cannot be automatically updated together with the change of user interests. In -this paper, we propose to learn implicit user profiles automatically from -large-scale user dialogue history for building personalized chatbots. -Specifically, leveraging the benefits of Transformer on language understanding, -we train a personalized language model to construct a general user profile from -the user's historical responses. To highlight the relevant historical responses -to the input post, we further establish a key-value memory network of -historical post-response pairs, and build a dynamic post-aware user profile. -The dynamic profile mainly describes what and how the user has responded to -similar posts in history. To explicitly utilize users' frequently used words, -we design a personalized decoder to fuse two decoding strategies, including -generating a word from the generic vocabulary and copying one word from the -user's personalized vocabulary. Experiments on two real-world datasets show the -significant improvement of our model compared with existing methods. 
Our code -is available at https://github.com/zhengyima/DHAP - ---------------- - -### 18 Nov 2016 | [Word and Document Embeddings based on Neural Network Approaches](https://arxiv.org/abs/1611.05962) | [⬇️](https://arxiv.org/pdf/1611.05962) -*Siwei Lai* - - Data representation is a fundamental task in machine learning. The -representation of data affects the performance of the whole machine learning -system. In a long history, the representation of data is done by feature -engineering, and researchers aim at designing better features for specific -tasks. Recently, the rapid development of deep learning and representation -learning has brought new inspiration to various domains. - In natural language processing, the most widely used feature representation -is the Bag-of-Words model. This model has the data sparsity problem and cannot -keep the word order information. Other features such as part-of-speech tagging -or more complex syntax features can only fit for specific tasks in most cases. -This thesis focuses on word representation and document representation. We -compare the existing systems and present our new model. - First, for generating word embeddings, we make comprehensive comparisons -among existing word embedding models. In terms of theory, we figure out the -relationship between the two most important models, i.e., Skip-gram and GloVe. -In our experiments, we analyze three key points in generating word embeddings, -including the model construction, the training corpus and parameter design. We -evaluate word embeddings with three types of tasks, and we argue that they -cover the existing use of word embeddings. Through theory and practical -experiments, we present some guidelines for how to generate a good word -embedding. - Second, in Chinese character or word representation. We introduce the joint -training of Chinese character and word. ... - Third, for document representation, we analyze the existing document -representation models, including recursive NNs, recurrent NNs and convolutional -NNs. We point out the drawbacks of these models and present our new model, the -recurrent convolutional neural networks. ... - ---------------- - -### 01 Oct 2023 | [RoleLLM: Benchmarking, Eliciting, and Enhancing Role-Playing Abilities of Large Language Models](https://arxiv.org/abs/2310.00746) | [⬇️](https://arxiv.org/pdf/2310.00746) -*Zekun Moore Wang, Zhongyuan Peng, Haoran Que, Jiaheng Liu, Wangchunshu Zhou, Yuhan Wu, Hongcheng Guo, Ruitong Gan, Zehao Ni, Man Zhang, Zhaoxiang Zhang, Wanli Ouyang, Ke Xu, Wenhu Chen, Jie Fu, Junran Peng* - - The advent of Large Language Models (LLMs) has paved the way for complex -tasks such as role-playing, which enhances user interactions by enabling models -to imitate various characters. However, the closed-source nature of -state-of-the-art LLMs and their general-purpose training limit role-playing -optimization. In this paper, we introduce RoleLLM, a framework to benchmark, -elicit, and enhance role-playing abilities in LLMs. RoleLLM comprises four -stages: (1) Role Profile Construction for 100 roles; (2) Context-Based -Instruction Generation (Context-Instruct) for role-specific knowledge -extraction; (3) Role Prompting using GPT (RoleGPT) for speaking style -imitation; and (4) Role-Conditioned Instruction Tuning (RoCIT) for fine-tuning -open-source models along with role customization. 
By Context-Instruct and -RoleGPT, we create RoleBench, the first systematic and fine-grained -character-level benchmark dataset for role-playing with 168,093 samples. -Moreover, RoCIT on RoleBench yields RoleLLaMA (English) and RoleGLM (Chinese), -significantly enhancing role-playing abilities and even achieving comparable -results with RoleGPT (using GPT-4). - ---------------- - -### 05 Mar 2021 | [Enhanced Aspect-Based Sentiment Analysis Models with Progressive Self-supervised Attention Learning](https://arxiv.org/abs/2103.03446) | [⬇️](https://arxiv.org/pdf/2103.03446) -*Jinsong Su, Jialong Tang, Hui Jiang, Ziyao Lu, Yubin Ge, Linfeng Song, Deyi Xiong, Le Sun, Jiebo Luo* - - In aspect-based sentiment analysis (ABSA), many neural models are equipped -with an attention mechanism to quantify the contribution of each context word -to sentiment prediction. However, such a mechanism suffers from one drawback: -only a few frequent words with sentiment polarities are tended to be taken into -consideration for final sentiment decision while abundant infrequent sentiment -words are ignored by models. To deal with this issue, we propose a progressive -self-supervised attention learning approach for attentional ABSA models. In -this approach, we iteratively perform sentiment prediction on all training -instances, and continually learn useful attention supervision information in -the meantime. During training, at each iteration, context words with the -highest impact on sentiment prediction, identified based on their attention -weights or gradients, are extracted as words with active/misleading influence -on the correct/incorrect prediction for each instance. Words extracted in this -way are masked for subsequent iterations. To exploit these extracted words for -refining ABSA models, we augment the conventional training objective with a -regularization term that encourages ABSA models to not only take full advantage -of the extracted active context words but also decrease the weights of those -misleading words. We integrate the proposed approach into three -state-of-the-art neural ABSA models. Experiment results and in-depth analyses -show that our approach yields better attention results and significantly -enhances the performance of all three models. We release the source code and -trained models at https://github.com/DeepLearnXMU/PSSAttention. - ---------------- - -### 28 Sep 2023 | [TPE: Towards Better Compositional Reasoning over Conceptual Tools with Multi-persona Collaboration](https://arxiv.org/abs/2309.16090) | [⬇️](https://arxiv.org/pdf/2309.16090) -*Hongru Wang, Huimin Wang, Lingzhi Wang, Minda Hu, Rui Wang, Boyang Xue, Hongyuan Lu, Fei Mi, Kam-Fai Wong* - - Large language models (LLMs) have demonstrated exceptional performance in -planning the use of various functional tools, such as calculators and -retrievers, particularly in question-answering tasks. In this paper, we expand -the definition of these tools, centering on conceptual tools within the context -of dialogue systems. A conceptual tool specifies a cognitive concept that aids -systematic or investigative thought. These conceptual tools play important -roles in practice, such as multiple psychological or tutoring strategies being -dynamically applied in a single turn to compose helpful responses. To further -enhance the reasoning and planning capability of LLMs with these conceptual -tools, we introduce a multi-persona collaboration framework: Think-Plan-Execute -(TPE). 
This framework decouples the response generation process into three -distinct roles: Thinker, Planner, and Executor. Specifically, the Thinker -analyzes the internal status exhibited in the dialogue context, such as user -emotions and preferences, to formulate a global guideline. The Planner then -generates executable plans to call different conceptual tools (e.g., sources or -strategies), while the Executor compiles all intermediate results into a -coherent response. This structured approach not only enhances the -explainability and controllability of responses but also reduces token -redundancy. We demonstrate the effectiveness of TPE across various dialogue -response generation tasks, including multi-source (FoCus) and multi-strategy -interactions (CIMA and PsyQA). This reveals its potential to handle real-world -dialogue interactions that require more complicated tool learning beyond just -functional tools. The full code and data will be released for reproduction. - ---------------- - -### 24 Jan 2024 | [UniMS-RAG: A Unified Multi-source Retrieval-Augmented Generation for Personalized Dialogue Systems](https://arxiv.org/abs/2401.13256) | [⬇️](https://arxiv.org/pdf/2401.13256) -*Hongru Wang, Wenyu Huang, Yang Deng, Rui Wang, Zezhong Wang, Yufei Wang, Fei Mi, Jeff Z. Pan, Kam-Fai Wong* - - Large Language Models (LLMs) has shown exceptional capabilities in many -natual language understanding and generation tasks. However, the -personalization issue still remains a much-coveted property, especially when it -comes to the multiple sources involved in the dialogue system. To better plan -and incorporate the use of multiple sources in generating personalized -response, we firstly decompose it into three sub-tasks: Knowledge Source -Selection, Knowledge Retrieval, and Response Generation. We then propose a -novel Unified Multi-Source Retrieval-Augmented Generation system (UniMS-RAG) -Specifically, we unify these three sub-tasks with different formulations into -the same sequence-to-sequence paradigm during the training, to adaptively -retrieve evidences and evaluate the relevance on-demand using special tokens, -called acting tokens and evaluation tokens. Enabling language models to -generate acting tokens facilitates interaction with various knowledge sources, -allowing them to adapt their behavior to diverse task requirements. Meanwhile, -evaluation tokens gauge the relevance score between the dialogue context and -the retrieved evidence. In addition, we carefully design a self-refinement -mechanism to iteratively refine the generated response considering 1) the -consistency scores between the generated response and retrieved evidence; and -2) the relevance scores. Experiments on two personalized datasets (DuLeMon and -KBP) show that UniMS-RAG achieves state-of-the-art performance on the knowledge -source selection and response generation task with itself as a retriever in a -unified manner. Extensive analyses and discussions are provided for shedding -some new perspectives for personalized dialogue systems. - ---------------- - -### 03 Nov 2020 | [CharBERT: Character-aware Pre-trained Language Model](https://arxiv.org/abs/2011.01513) | [⬇️](https://arxiv.org/pdf/2011.01513) -*Wentao Ma, Yiming Cui, Chenglei Si, Ting Liu, Shijin Wang, Guoping Hu* - - Most pre-trained language models (PLMs) construct word representations at -subword level with Byte-Pair Encoding (BPE) or its variations, by which OOV -(out-of-vocab) words are almost avoidable. 
However, those methods split a word -into subword units and make the representation incomplete and fragile. In this -paper, we propose a character-aware pre-trained language model named CharBERT -improving on the previous methods (such as BERT, RoBERTa) to tackle these -problems. We first construct the contextual word embedding for each token from -the sequential character representations, then fuse the representations of -characters and the subword representations by a novel heterogeneous interaction -module. We also propose a new pre-training task named NLM (Noisy LM) for -unsupervised character representation learning. We evaluate our method on -question answering, sequence labeling, and text classification tasks, both on -the original datasets and adversarial misspelling test sets. The experimental -results show that our method can significantly improve the performance and -robustness of PLMs simultaneously. Pretrained models, evaluation sets, and code -are available at https://github.com/wtma/CharBERT - ---------------- - -### 22 Feb 2018 | [Multimodal Named Entity Recognition for Short Social Media Posts](https://arxiv.org/abs/1802.07862) | [⬇️](https://arxiv.org/pdf/1802.07862) -*Seungwhan Moon, Leonardo Neves, Vitor Carvalho* - - We introduce a new task called Multimodal Named Entity Recognition (MNER) for -noisy user-generated data such as tweets or Snapchat captions, which comprise -short text with accompanying images. These social media posts often come in -inconsistent or incomplete syntax and lexical notations with very limited -surrounding textual contexts, bringing significant challenges for NER. To this -end, we create a new dataset for MNER called SnapCaptions (Snapchat -image-caption pairs submitted to public and crowd-sourced stories with fully -annotated named entities). We then build upon the state-of-the-art Bi-LSTM -word/character based NER models with 1) a deep image network which incorporates -relevant visual context to augment textual information, and 2) a generic -modality-attention module which learns to attenuate irrelevant modalities while -amplifying the most informative ones to extract contexts from, adaptive to each -sample and token. The proposed MNER model with modality attention significantly -outperforms the state-of-the-art text-only NER models by successfully -leveraging provided visual contexts, opening up potential applications of MNER -on myriads of social media platforms. - ---------------- - -### 19 Aug 2023 | [HICL: Hashtag-Driven In-Context Learning for Social Media Natural Language Understanding](https://arxiv.org/abs/2308.09985) | [⬇️](https://arxiv.org/pdf/2308.09985) -*Hanzhuo Tan, Chunpu Xu, Jing Li, Yuqun Zhang, Zeyang Fang, Zeyu Chen, Baohua Lai* - - Natural language understanding (NLU) is integral to various social media -applications. However, existing NLU models rely heavily on context for semantic -learning, resulting in compromised performance when faced with short and noisy -social media content. To address this issue, we leverage in-context learning -(ICL), wherein language models learn to make inferences by conditioning on a -handful of demonstrations to enrich the context and propose a novel -hashtag-driven in-context learning (HICL) framework. Concretely, we pre-train a -model #Encoder, which employs #hashtags (user-annotated topic labels) to drive -BERT-based pre-training through contrastive learning. 
---------------

### 19 Aug 2023 | [HICL: Hashtag-Driven In-Context Learning for Social Media Natural Language Understanding](https://arxiv.org/abs/2308.09985) | [⬇️](https://arxiv.org/pdf/2308.09985)
*Hanzhuo Tan, Chunpu Xu, Jing Li, Yuqun Zhang, Zeyang Fang, Zeyu Chen, Baohua Lai*

Natural language understanding (NLU) is integral to various social media applications. However, existing NLU models rely heavily on context for semantic learning, resulting in compromised performance when faced with short and noisy social media content. To address this issue, we leverage in-context learning (ICL), wherein language models learn to make inferences by conditioning on a handful of demonstrations that enrich the context, and propose a novel hashtag-driven in-context learning (HICL) framework. Concretely, we pre-train a model, #Encoder, which employs #hashtags (user-annotated topic labels) to drive BERT-based pre-training through contrastive learning. Our objective is to enable #Encoder to incorporate topic-related semantic information, allowing it to retrieve topic-related posts that enrich contexts and enhance social media NLU under noisy conditions. To further integrate the retrieved context with the source text, we employ a gradient-based method to identify trigger terms useful for fusing information from both sources. For empirical studies, we collected 45M tweets to set up an in-context NLU benchmark, and the experimental results on seven downstream tasks show that HICL substantially advances the previous state-of-the-art results. Furthermore, we conducted extensive analyses and found that: (1) combining the source input with a top-retrieved post from #Encoder is more effective than using semantically similar posts; and (2) trigger words largely benefit the merging of context from the source and retrieved posts.

---------------

### 21 Jan 2023 | [REDAffectiveLM: Leveraging Affect Enriched Embedding and Transformer-based Neural Language Model for Readers' Emotion Detection](https://arxiv.org/abs/2301.08995) | [⬇️](https://arxiv.org/pdf/2301.08995)
*Anoop Kadan, Deepak P., Manjary P. Gangan, Savitha Sam Abraham, Lajish V. L*

Technological advancements in web platforms allow people to express and share emotions towards textual write-ups written and shared by others. This gives rise to two interesting domains for analysis: the emotion expressed by the writer and the emotion elicited in the readers. In this paper, we propose a novel approach for readers' emotion detection from short-text documents using a deep learning model called REDAffectiveLM. Within state-of-the-art NLP tasks, it is well understood that utilizing context-specific representations from transformer-based pre-trained language models helps achieve improved performance. Within this affective computing task, we explore how incorporating affective information can further enhance performance. Towards this, we leverage context-specific and affect-enriched representations by using a transformer-based pre-trained language model in tandem with an affect-enriched Bi-LSTM+Attention network. For empirical evaluation, we procure a new dataset, REN-20k, in addition to using RENh-4k and SemEval-2007. We evaluate the performance of REDAffectiveLM rigorously across these datasets, against a vast set of state-of-the-art baselines, where our model consistently outperforms the baselines and obtains statistically significant results. Our results establish that utilizing affect-enriched representations along with context-specific representations within a neural architecture can considerably enhance readers' emotion detection. Since the impact of affect enrichment specifically in readers' emotion detection is not well explored, we conduct a detailed analysis of the affect-enriched Bi-LSTM+Attention component using qualitative and quantitative model behavior evaluation techniques. We observe that, compared to conventional semantic embeddings, affect-enriched embeddings increase the ability of the network to effectively identify and assign weight to the key terms responsible for readers' emotion detection.
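The affect-enriched Bi-LSTM+Attention branch can be pictured as concatenating lexicon-derived affect scores to each token's contextual embedding before recurrent encoding and attention pooling. The PyTorch sketch below is a rough approximation under assumed dimensions (768-d contextual embeddings, a 10-d affect vector per token, 8 emotion classes); it is not the REDAffectiveLM implementation.

```python
import torch
import torch.nn as nn


class AffectEnrichedEncoder(nn.Module):
    """Contextual embeddings + affect-lexicon features -> Bi-LSTM -> attention pooling (sketch)."""

    def __init__(self, ctx_dim: int = 768, affect_dim: int = 10,
                 hidden: int = 128, n_emotions: int = 8):
        super().__init__()
        self.bilstm = nn.LSTM(ctx_dim + affect_dim, hidden,
                              batch_first=True, bidirectional=True)
        self.attn = nn.Linear(2 * hidden, 1)
        self.classify = nn.Linear(2 * hidden, n_emotions)

    def forward(self, ctx_emb, affect_feats):
        # ctx_emb: (B, T, 768) from a pre-trained transformer;
        # affect_feats: (B, T, 10) per-token affect-lexicon scores.
        h, _ = self.bilstm(torch.cat([ctx_emb, affect_feats], dim=-1))
        alpha = torch.softmax(self.attn(h), dim=1)   # attention weights over tokens
        doc = (alpha * h).sum(dim=1)                 # affect-aware document vector
        return self.classify(doc)                    # readers'-emotion logits


# Toy forward pass: batch of 4 documents, 50 tokens each.
logits = AffectEnrichedEncoder()(torch.randn(4, 50, 768), torch.rand(4, 50, 10))
```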
---------------

# 9

1. 🧠 Neural Module Networks for Reasoning
   - Combining neural networks with symbolic reasoning
   - Applicable to Telemed for diagnosis and treatment planning

2. 🤖 Neuro-Symbolic AI for Reasoning
   - Integrating neural networks with logical reasoning
   - Useful for AI-assisted medical decision-making

3. 🔍 Reasoning with Transformer Models
   - Large language models for multi-step reasoning tasks
   - Applicable to AI-assisted medical question answering

### Keywords Glossary

- 🧠 Neural Module Networks
- 🤖 Neuro-Symbolic AI
- 🔍 Transformer Models
- 📊 Graph Neural Networks
- 🔬 Inductive Logic Programming
- 🧭 Reasoning Templates
- 🧩 Modular Reasoning

```mermaid
graph TD
    A[Neural Module Networks] -->|Combines| B[Neuro-Symbolic AI]
    B --> C[Transformer Models]
    C --> D[Graph Neural Networks]
    D --> E[Inductive Logic Programming]
    E --> F[Reasoning Templates]
    F --> G[Modular Reasoning]
```

```python
# app.py
import streamlit as st

st.title("Reasoning Techniques")

st.header("State of the Art Techniques")
st.write("1. 🧠 Neural Module Networks for Reasoning")
st.write("2. 🤖 Neuro-Symbolic AI for Reasoning")
st.write("3. 🔍 Reasoning with Transformer Models")

st.header("Keywords Glossary")
st.write("- 🧠 Neural Module Networks")
st.write("- 🤖 Neuro-Symbolic AI")
st.write("- 🔍 Transformer Models")
st.write("- 📊 Graph Neural Networks")
st.write("- 🔬 Inductive Logic Programming")
st.write("- 🧭 Reasoning Templates")
st.write("- 🧩 Modular Reasoning")

st.header("Mermaid Diagram")
# Streamlit has no built-in st.mermaid(); display the diagram source instead.
# (A third-party component such as streamlit-mermaid could render it graphically.)
st.code("""
graph TD
    A[Neural Module Networks] -->|Combines| B[Neuro-Symbolic AI]
    B --> C[Transformer Models]
    C --> D[Graph Neural Networks]
    D --> E[Inductive Logic Programming]
    E --> F[Reasoning Templates]
    F --> G[Modular Reasoning]
""", language="mermaid")
```

'''

st.markdown(SlideDeck)
\ No newline at end of file