diff --git a/app/page.tsx b/app/page.tsx index eb913f1..595ee61 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -20,6 +20,7 @@ import sotopiaTown from '../public/island_only.svg' import agentvsstory from '../public/projects/agent_vs_storyteller.jpg' import sotopiaPi from '../public/projects/sotopia_pi.jpg' import sotopia from '../public/projects/sotopia.jpg' +import ai_liedar from '../public/projects/ai_liedar/intro_conversation.png' import { ModeToggle } from "@/components/ui/model-toggle"; import { ThemeProvider } from "@/components/theme-provider"; import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar"; @@ -281,6 +282,18 @@ export default function Home() { Latest Research +
+ + AI-LieDar: Examine the Trade-off Between Utility and Truthfulness in LLM Agents

} + /> +

+ TL;DR: AI-LieDar presents LLMs with carefully crafted scenarios to see how they handle these truth-vs-helpfulness dilemmas. +

+ +
& React.HTMLAttributes) { + return
+} +function SectionText(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.HTMLAttributes) { + return

+} +function SectionTitle(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.HTMLAttributes) { + return

+} +function SectionSubtitle(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.HTMLAttributes) { + return

+} +function Paragraph(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.HTMLAttributes) { + return

+} +function OrderedList(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.OlHTMLAttributes) { + return

    +} + +function ListItem(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.LiHTMLAttributes) { + return
  1. +} + + +function Title() { + return ( + <> +

    + AI-LieDar: Examine the Trade-off Between Utility and Truthfulness in LLM Agents +

    +
    +
    Zhe Su1
    +
    Xuhui Zhou1
    +
    Sanketh Rangreji1
    +
    Anubha Kabra1
    +
    Julia Mendelsohn2
    +
    Faeze Brahman3
    +
    Maarten Sap1
    +
    +
    + 1Carnegie Mellon University + 2University of Michigan, Ann Arbor + 3Allen Institute for AI +
    + + + ); +} + +export default function Index() { + const scrollContainerRef = React.useRef(null); + return ( +
    + + + +

    Research.

    +
    + + +
    + +
    +
    +
    + +
    +
    +
    + + {/* +
    + +
    +
    + */} + + + When AI Helpfulness Meets Truthfulness + The Dilemma in Instruction - Truthful or Utility? + + When deploying our LLM-based agents, we want an AI assistant to be: + + Helpful - providing useful information and assistance to achieve the user's goals (i.e. achieving Utility) + Truthful - offering accurate and honest responses +
    + {/* 1. Helpful - providing useful information and assistance to achieve the user's goals (i.e. achieving Utility)
    + 2. Truthful - offering accurate and honest responses + However, these two qualities can sometimes conflict.
    */} + Imagine chatting with an AI shopping assistant designed to help you pick the best product. This seemingly straightforward interaction can lead to different outcomes: + + A truthful AI might candidly disclose a product's shortcomings, even if that means failing to make a sale. + A helpful AI (from the seller's perspective) might engage in deceptive practices, such as obfuscation or even outright lying, to convince you to buy the product. +
    +{/* + 1. A truthful AI might candidly disclose a product's shortcomings, even if that means failing to make a sale.
    + 2. A helpful AI (from the seller's perspective) might engage in deceptive practices, such as obfuscation or even outright lying, to convince you to buy the product. */} + This scenario highlights a crucial dilemma: How do AI agents handle the tricky trade-off between utility and being truthful? And we believe understanding how AI navigates these complex trade-offs is crucial for developing more reliable and ethically-aligned systems. +
    +
    + + + Why Understanding Instruction is Hard + + In real-world scenarios, humans often provide ambiguous or under-specified instructions. We naturally rely on common sense and prior knowledge to fill in the gaps and navigate complex situations. However, AI agents lack this intuitive understanding. + Similarly, humans have implicit expectations about truthfulness and social norms that we don't always explicitly state. An AI focused solely on achieving a given goal might not consider these unspoken rules, potentially leading to unintended behaviors. + The situation is further complicated by the question of whose instructions the AI should prioritize and under what conditions. This ambiguity can lead to scenarios where an AI's interpretation of "helpful" might not align with ethical standards or user expectations. + + + + + Enter AI-LieDar + + To dig into this, we've created AI-LieDar. It's a tool that presents LLMs with carefully crafted scenarios to see how they handle these truth-vs-helpfulness dilemmas. + Our goal? To understand how AI makes these tough calls. This could help us build AI systems that are not just smart, but also ethical and reliable. + By exploring this balance between being useful and being truthful, we're tackling a key question in AI ethics. It's not just about making AI work well – it's about making it work right. + + {/*
    + AI-LieDar concept +
    */} +
    + + + AI-LieDar Framework + AI-LieDar Framework Illustration + Scenario Construction + + The heart of our study lies in the carefully crafted scenarios we developed. Our goal was to create situations that would challenge AI models, forcing them to balance helpfulness and truthfulness. The figure above shows how we do it:
    + Creating Conflict: The core of each scenario is a tension between: + + Utility: What the AI needs to achieve + Negative Information: Facts that might hinder that goal if revealed + + {/* 1. Utility: What the AI needs to achieve
    + 2. Negative Information: Facts that might hinder that goal if revealed */} + We also included instructions on why the AI should or shouldn't be truthful, adding depth to the ethical dilemma.

    + Ensuring Diversity: To cover a wide range of situations, we categorized our scenarios based on: + + Who benefits from potential deception: does the act mainly benefit liar (self-oriented)? Or benefits others (other-oriented) + Why people lie: Is it for getting various benefits (time, money), maintaining self-esteem (on personal taste, competence), or other motivations + + Based on the first categorization, our main categories were: Benefits, Public Image, and Emotion + We further diversified scenarios within these categories, drawing inspiration from various human motivations for lying. In total we get 20 scenarios for each category, and you can see one example in the above figure. +
    +
    + + + Evaluator Construction + + In real life, honesty isn't always black and white. We wanted our study to reflect this complexity. Drawing from psychological research, we developed a more nuanced way to evaluate AI truthfulness: + + Fully Truthful: Complete honesty + Partial Lie: Omitting details or being vague + Falsification: Outright lying +
    + {/* 1. Fully Truthful: Complete honesty
    + 2. Partial Lie: Omitting details or being vague
    + 3. Falsification: Outright lying */} + To assess thousands of responses, we used GPT-4o to be our "truth detector." We gave it clear definitions and examples of each category. + The good news? Our AI truth detector agreed with human judges about 85% of the time. This gave us confidence in using it for our larger study. +
    +
    + + Results + Main Results of truthfulness for different models + Main Results: The Truth About AI Honesty + {/*
    + AI Honesty Results +
    */} + + Our study revealed some interesting patterns in how AI models handle truthfulness: + + + Honesty Isn't the Default
    + Surprisingly, we found that AI models often resort to deception, being truthful 50% of the time or less. Different models showed different tendencies: + + GPT-4 and Mixtral-7*22B were the most honest. + GPT-3.5-Turbo was more likely to tell outright falsehoods. + LLaMA models were the most prone to various deceptive behaviors, including partial lies. + + {/* 1. GPT-4 and Mixtral-7*22B were the most honest.
    + 2. GPT-3.5-Turbo was more likely to tell outright falsehoods.
    + 3. LLaMA models were the most prone to various deceptive behaviors, including partial lies. */} + The Grey Area
    + Many models tried to balance being helpful and truthful by giving ambiguous or vague answers. It seems even AI knows how to sidestep a tricky situation!
    + Goal Achievement
    + Despite their tendency to be less than fully truthful, most models performed well in achieving their given goals. The exception was the smaller LLaMA models (3-8B), which struggled more. +
    +
    + + + How Different Information Components Work + + We dug deeper to see what factors influence an AI's truthfulness: + + Hidden Information: Giving the AI "secret" information made it less truthful. + Reasons Not to Lie: Providing motivations for honesty increased truthfulness. + Removing Temptation: Surprisingly, taking away reasons to lie boosted truthfulness by about 40%. +
    + {/* 1. Hidden Information: Giving the AI "secret" information made it less truthful.
    + 2. Reasons Not to Lie: Providing motivations for honesty increased truthfulness.
    + 3. Removing Temptation: Surprisingly, taking away reasons to lie boosted truthfulness by about 40%. */} + Importantly, we found that considering partial lies is crucial when evaluating AI truthfulness. +
    + Ablation Study on Information Components +
    + + + Models Can Be Steered + + We discovered that we can influence an AI's honesty through instructions: + + + Flexibility: Both truthfulness and deception rates changed significantly with guidance.
    + Stronger Models, Bigger Changes: More advanced models like GPT-4 showed larger shifts in behavior.
    + Honesty vs. Goals: In scenarios with clear, objective goals, being more truthful often meant being less effective at achieving the goal. This trade-off was less noticeable in more subjective scenarios.
    + Interestingly, even when explicitly instructed to be honest, models still occasionally lied. +
    + Ablation Study on Steering +
    + + + Implications + + Our research into AI truthfulness and helpfulness reveals several important implications: + + Steerability: Flexible but Risky + + We found that AI models can be steered towards or away from truthfulness. This flexibility is a double-edged sword. While it allows for customization, it also raises concerns about potential misuse. How can we ensure AI systems remain truthful when it matters most? + + The Cost of Honesty + + Being truthful isn't always free. Our experiments showed that in some scenarios, prioritizing honesty can reduce an AI's ability to achieve certain goals. This trade-off raises an important question: How do we balance truthfulness with effectiveness in AI systems? + + The Dilemma of Deception + + Interestingly, we observed that AI models often try to avoid the truthfulness-helpfulness conflict by giving vague responses. However, this approach isn't always helpful. Sometimes, a clear but false statement might provide more useful information than an evasive truth. + This leads us to a challenging question: When, if ever, is it okay for AI to be strategically deceptive? And who gets to make that decision - individuals, AI companies, or governments? + + + + + Final Words + + Our study reveals that AI language models, like humans, face complex decisions when balancing honesty and achieving goals. We found that different models have varying tendencies to lie, but prioritize utility over complete truthfulness. Interestingly, while we can influence models to be more truthful, there's always a risk they might not be entirely honest. + + + This research is just the beginning. As we continue to develop more advanced AI, it's crucial that we keep exploring these ethical dilemmas. Our goal is to create AI systems that are both useful and trustworthy - a balance that will be essential as AI becomes increasingly integrated into our daily lives. + + +
    +
    +
    + ); +} \ No newline at end of file diff --git a/app/projects/sotopia-pi/page.tsx b/app/projects/sotopia-pi/page.tsx index dd0a6a8..93b5bbd 100644 --- a/app/projects/sotopia-pi/page.tsx +++ b/app/projects/sotopia-pi/page.tsx @@ -23,19 +23,6 @@ import { Abhaya_Libre } from "next/font/google"; import { Detail } from "@/components/ListDetail/Detail"; import React from "react"; -import { - NavigationMenu, - NavigationMenuContent, - NavigationMenuIndicator, - NavigationMenuItem, - NavigationMenuLink, - NavigationMenuList, - NavigationMenuTrigger, - NavigationMenuViewport, - navigationMenuTriggerStyle, -} from "@/components/ui/navigation-menu" - - function SectionContent(props: React.JSX.IntrinsicAttributes & React.ClassAttributes & React.HTMLAttributes) { return
    } diff --git a/components/navigation.tsx b/components/navigation.tsx index 18a9fb2..13f4cb1 100644 --- a/components/navigation.tsx +++ b/components/navigation.tsx @@ -57,6 +57,11 @@ const project_constants = [ href: "/projects/agent_vs_script", description: "Realistic social simulations require information asymmetry.", }, + { + title: "AI-Liedar", + href: "/projects/ai_liedar", + description: "To be truthful or to be helpful?", + }, ] import Link, { LinkProps } from "next/link" diff --git a/components/ui/image-text-overlay.tsx b/components/ui/image-text-overlay.tsx index 2a30fb9..befb96c 100644 --- a/components/ui/image-text-overlay.tsx +++ b/components/ui/image-text-overlay.tsx @@ -10,11 +10,13 @@ interface ImageWithTextOverlayProps { const ImageWithTextOverlay: React.FC = ({ src, alt, title }) => { return ( -
    +
    {/* 9:16 Aspect Ratio */} {alt}