Skip to content

Commit a53802e

Browse files
committed
Improve citation handling and link sanitization
Enhances citation mapping and normalization to avoid duplicate system messages and properly mask code segments. Refines URL extraction and normalization, improves link sanitization in markdown rendering, and updates citation rendering logic to use safe href attributes. Also fixes message content updates to preserve initial content during streaming.
1 parent: 5f7e330 · commit: a53802e

File tree

4 files changed, with 164 additions and 51 deletions.

src/lib/components/chat/MarkdownRenderer.svelte.test.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ describe("MarkdownRenderer", () => {
3636
{
3737
title: "foo",
3838
link: "https://example.com",
39+
index: 1,
3940
},
4041
],
4142
};
@@ -54,14 +55,17 @@ describe("MarkdownRenderer", () => {
5455
{
5556
title: "foo",
5657
link: "https://foo.com",
58+
index: 1,
5759
},
5860
{
5961
title: "bar",
6062
link: "https://bar.com",
63+
index: 2,
6164
},
6265
{
6366
title: "baz",
6467
link: "https://baz.com",
68+
index: 3,
6569
},
6670
],
6771
});
@@ -77,6 +81,7 @@ describe("MarkdownRenderer", () => {
7781
{
7882
title: "foo",
7983
link: "https://example.com",
84+
index: 1,
8085
},
8186
],
8287
});

src/lib/server/textGeneration/generate.ts

Lines changed: 69 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -56,10 +56,24 @@ type ToolRun = {
5656
output: string;
5757
};
5858

59-
const URL_REGEX = /https?:\/\/[^\s)\]}"'>]+/g;
59+
const URL_REGEX = /https?:\/\/[^\s<]+/g;
6060

6161
function normalizeUrl(raw: string): string {
62-
return raw.replace(/[)\].,;:"'>]+$/g, "");
62+
const canParse = (value: string) => {
63+
try {
64+
const parsed = new URL(value);
65+
return parsed.protocol === "http:" || parsed.protocol === "https:";
66+
} catch {
67+
return false;
68+
}
69+
};
70+
71+
if (canParse(raw)) {
72+
return raw;
73+
}
74+
75+
const trimmed = raw.replace(/[)\].,;:"'>]+$/g, "");
76+
return canParse(trimmed) ? trimmed : raw;
6377
}
6478

6579
function buildToolPreprompt(tools: OpenAiTool[]): string {
@@ -286,6 +300,7 @@ async function* runMcpFlow({
286300
const citationSources: MessageSource[] = [];
287301
const citationIndex = new Map<string, number>();
288302
let lastMappingCount = 0;
303+
let mappingMessageIndex: number | null = null;
289304

290305
const buildCitationMappingMessage = (): string | null => {
291306
if (citationSources.length === 0) {
@@ -314,7 +329,20 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
314329
if (!mappingMessage) {
315330
return;
316331
}
332+
if (mappingMessageIndex !== null && mappingMessageIndex >= 0) {
333+
if (mappingMessageIndex < messagesOpenAI.length) {
334+
const existing = messagesOpenAI[mappingMessageIndex];
335+
if (existing?.role === "system") {
336+
messagesOpenAI = [
337+
...messagesOpenAI.slice(0, mappingMessageIndex),
338+
...messagesOpenAI.slice(mappingMessageIndex + 1),
339+
];
340+
}
341+
}
342+
mappingMessageIndex = null;
343+
}
317344
messagesOpenAI = [...messagesOpenAI, { role: "system", content: mappingMessage }];
345+
mappingMessageIndex = messagesOpenAI.length - 1;
318346
lastMappingCount = citationSources.length;
319347
};
320348

@@ -359,26 +387,27 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
359387

360388
const stripTrailingSourcesBlock = (input: string): string => {
361389
const lines = input.split("\n");
362-
let start = -1;
390+
let headerIndex = -1;
363391
for (let i = lines.length - 1; i >= 0; i -= 1) {
364392
const trimmed = lines[i].trim();
365393
if (!trimmed) continue;
366-
if (/^sources?:\s*$/i.test(trimmed)) {
367-
start = i;
368-
break;
369-
}
370394
if (
395+
/^sources?:\s*$/i.test(trimmed) ||
371396
/^sources?:\s*(?:\[[\d]+\]\([^)]*\)|\(?\s*\d+\)?|https?:\/\/\S+)(?:\s*,\s*(?:\[[\d]+\]\([^)]*\)|\(?\s*\d+\)?|https?:\/\/\S+))*$/i.test(
372397
trimmed
373398
)
374399
) {
375-
start = i;
400+
headerIndex = i;
376401
break;
377402
}
378-
return input;
403+
if (
404+
!/^[-*]?\s*(?:\(?\s*\d+\)?\.?\s*)?(https?:\/\/\S+|\[[\d]+\]\([^)]*\))\s*$/i.test(trimmed)
405+
) {
406+
return input;
407+
}
379408
}
380-
if (start === -1) return input;
381-
for (let j = start + 1; j < lines.length; j += 1) {
409+
if (headerIndex === -1) return input;
410+
for (let j = headerIndex + 1; j < lines.length; j += 1) {
382411
const trimmed = lines[j].trim();
383412
if (!trimmed) continue;
384413
if (
@@ -387,7 +416,7 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
387416
return input;
388417
}
389418
}
390-
return lines.slice(0, start).join("\n").replace(/\s+$/, "");
419+
return lines.slice(0, headerIndex).join("\n").replace(/\s+$/, "");
391420
};
392421

393422
const appendMissingCitations = (text: string): string => {
@@ -425,7 +454,32 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
425454
const normalizeCitations = (
426455
text: string
427456
): { normalizedText: string; normalizedSources: MessageSource[] } => {
428-
const indices = extractUsedSourceIndexes(text);
457+
const maskCodeSegments = (value: string) => {
458+
const placeholders: string[] = [];
459+
const token = (index: number) => `\uE000${index}\uE001`;
460+
const stash = (match: string) => {
461+
const placeholder = token(placeholders.length);
462+
placeholders.push(match);
463+
return placeholder;
464+
};
465+
466+
let masked = value.replace(/```[\s\S]*?```/g, stash);
467+
masked = masked.replace(/~~~[\s\S]*?~~~/g, stash);
468+
masked = masked.replace(/`[^`]*`/g, stash);
469+
470+
const unmask = (input: string) =>
471+
input.replace(/\uE000(\d+)\uE001/g, (_match, idx) => placeholders[Number(idx)] ?? "");
472+
473+
return { masked, unmask };
474+
};
475+
476+
if (citationSources.length === 0) {
477+
return { normalizedText: text, normalizedSources: [] };
478+
}
479+
480+
const { masked, unmask } = maskCodeSegments(text);
481+
482+
const indices = extractUsedSourceIndexes(masked);
429483
if (indices.length === 0) {
430484
return { normalizedText: text, normalizedSources: [] };
431485
}
@@ -435,7 +489,7 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
435489
mapping.set(oldIndex, position + 1);
436490
});
437491

438-
const normalizedText = text.replace(
492+
const normalizedMaskedText = masked.replace(
439493
/\[(\d+(?:\s*,\s*\d+)*)\]/g,
440494
(match: string, group: string) => {
441495
const parts = group.split(/\s*,\s*/);
@@ -465,7 +519,7 @@ Reference only these indices (e.g., [1]) and reuse numbers for repeat URLs.`;
465519
.filter((source): source is MessageSource => Boolean(source))
466520
.sort((a, b) => a.index - b.index);
467521

468-
return { normalizedText, normalizedSources };
522+
return { normalizedText: unmask(normalizedMaskedText), normalizedSources };
469523
};
470524

471525
let lastAssistantContent = "";

src/lib/utils/marked.ts

Lines changed: 86 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ import type { Tokens, TokenizerExtension, RendererExtension } from "marked";
66
type SimpleSource = {
77
title?: string;
88
link: string;
9+
index?: number;
910
};
1011
import hljs from "highlight.js";
1112

@@ -147,60 +148,112 @@ function escapeHTML(content: string) {
147148
);
148149
}
149150

151+
const ALLOWED_PROTOCOLS = new Set(["http:", "https:", "mailto:", "tel:"]);
152+
153+
function escapeAttribute(value: string): string {
154+
return value
155+
.replace(/&/g, "&amp;")
156+
.replace(/"/g, "&quot;")
157+
.replace(/'/g, "&#39;")
158+
.replace(/</g, "&lt;")
159+
.replace(/>/g, "&gt;");
160+
}
161+
162+
function sanitizeHrefAttribute(
163+
href: string | undefined,
164+
{ allowRelative = false }: { allowRelative?: boolean } = {}
165+
): string {
166+
if (!href) return "";
167+
const trimmed = href.replace(/>$/, "");
168+
169+
const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(trimmed);
170+
if (hasScheme) {
171+
try {
172+
const parsed = new URL(trimmed);
173+
if (!ALLOWED_PROTOCOLS.has(parsed.protocol)) {
174+
return "";
175+
}
176+
return escapeAttribute(trimmed);
177+
} catch {
178+
return "";
179+
}
180+
}
181+
182+
if (!allowRelative) {
183+
return "";
184+
}
185+
186+
if (/^(?:#|\/|\.\/|\.\.\/)/.test(trimmed)) {
187+
return escapeAttribute(trimmed);
188+
}
189+
190+
return "";
191+
}
192+
150193
function transformOutsideHtmlCode(html: string, transform: (segment: string) => string): string {
151194
const parts = html.split(/(<pre[\s\S]*?<\/pre>|<code[\s\S]*?<\/code>)/gi);
152-
return parts
153-
.map((part) => (/^<pre|^<code/i.test(part) ? part : transform(part)))
154-
.join("");
195+
return parts.map((part) => (/^<pre|^<code/i.test(part) ? part : transform(part))).join("");
155196
}
156197

157198
function addInlineCitations(html: string, webSearchSources: SimpleSource[] = []): string {
158199
const linkStyle = "color: rgb(59, 130, 246); text-decoration: none;";
200+
const indexMap = new Map<number, SimpleSource>();
201+
webSearchSources.forEach((source, position) => {
202+
const resolvedIndex = Number.isFinite(source.index) ? Number(source.index) : position + 1;
203+
if (resolvedIndex > 0 && !indexMap.has(resolvedIndex)) {
204+
indexMap.set(resolvedIndex, source);
205+
}
206+
});
207+
159208
const applyReplacements = (value: string) =>
160209
value
161-
.replace(/\[(\d+)\](?!\()/g, (match: string) => {
162-
const indices: number[] = (match.match(/\d+/g) || []).map(Number);
163-
const links: string = indices
164-
.map((index: number) => {
165-
if (index === 0) return "";
166-
const source = webSearchSources[index - 1];
167-
if (!source) return "";
168-
return `<a href="${source.link}" target="_blank" rel="noopener noreferrer" style="${linkStyle}">${index}</a>`;
169-
})
170-
.filter(Boolean)
171-
.join(", ");
172-
return links ? ` <sup>${links}</sup>` : match;
210+
.replace(/\[(\d+)\](?!\()/g, (match: string, rawIndex: string) => {
211+
const index = Number(rawIndex);
212+
const source = indexMap.get(index);
213+
if (!source) {
214+
return match;
215+
}
216+
const safeHref = sanitizeHrefAttribute(source.link, { allowRelative: false });
217+
return safeHref
218+
? ` <sup><a href="${safeHref}" target="_blank" rel="noopener noreferrer" style="${linkStyle}">${index}</a></sup>`
219+
: match;
173220
})
174221
.replace(/\((\d+\s*(?:,\s*\d+)+)\)/g, (match: string, group: string) => {
175-
const indices = group
222+
const linked = group
176223
.split(/\s*,\s*/)
177-
.map((value) => Number(value.trim()))
178-
.filter((value) => Number.isFinite(value) && value > 0);
179-
if (indices.length === 0) return match;
180-
const links = indices
181-
.map((index: number) => {
182-
const source = webSearchSources[index - 1];
183-
if (!source) return "";
184-
return `<a href="${source.link}" target="_blank" rel="noopener noreferrer" style="${linkStyle}">${index}</a>`;
224+
.map((token) => {
225+
const index = Number(token);
226+
const source = indexMap.get(index);
227+
if (!source) {
228+
return "";
229+
}
230+
const safeHref = sanitizeHrefAttribute(source.link, { allowRelative: false });
231+
return safeHref
232+
? `<a href="${safeHref}" target="_blank" rel="noopener noreferrer" style="${linkStyle}">${index}</a>`
233+
: "";
185234
})
186235
.filter(Boolean)
187236
.join(", ");
188-
return links ? ` (<sup>${links}</sup>)` : match;
237+
return linked ? ` (<sup>${linked}</sup>)` : match;
189238
});
190239

191-
const decorate = (segment: string) => applyReplacements(segment);
192-
return transformOutsideHtmlCode(html, decorate);
240+
return transformOutsideHtmlCode(html, applyReplacements);
193241
}
194242

195243
function createMarkedInstance(sources: SimpleSource[]): Marked {
196244
return new Marked({
197-
hooks: {
198-
postprocess: (html) => addInlineCitations(html, sources),
199-
},
245+
hooks: {
246+
postprocess: (html) => addInlineCitations(html, sources),
247+
},
200248
extensions: [katexBlockExtension, katexInlineExtension],
201-
renderer: {
202-
link: (href, title, text) =>
203-
`<a href="${href?.replace(/>$/, "")}" target="_blank" rel="noopener noreferrer">${text}</a>`,
249+
renderer: {
250+
link: (href, _title, text) => {
251+
const safeHref = sanitizeHrefAttribute(href, { allowRelative: true });
252+
const safeText = escapeHTML(text ?? "");
253+
return safeHref
254+
? `<a href="${safeHref}" target="_blank" rel="noopener noreferrer">${safeText}</a>`
255+
: safeText;
256+
},
204257
html: (html) => escapeHTML(html),
205258
},
206259
gfm: true,

src/routes/conversation/[id]/+page.svelte

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -247,6 +247,7 @@
247247
if (!messageToWriteTo) {
248248
throw new Error("Message to write to not found");
249249
}
250+
const initialMessageContent = messageToWriteTo.content ?? "";
250251
251252
const messageUpdatesAbortController = new AbortController();
252253
@@ -332,9 +333,9 @@
332333
messageToWriteTo.content += buffer;
333334
buffer = "";
334335
}
335-
if (update.text) {
336-
messageToWriteTo.content = update.text;
337-
}
336+
const finalText =
337+
update.text ?? messageToWriteTo.content.slice(initialMessageContent.length);
338+
messageToWriteTo.content = initialMessageContent + finalText;
338339
messageToWriteTo.interrupted = update.interrupted;
339340
if (update.sources && update.sources.length > 0) {
340341
messageToWriteTo.sources = update.sources;

0 commit comments

Comments (0)