diff --git a/pkg/templates/python/cua/providers/gemini.py b/pkg/templates/python/cua/providers/gemini.py index 0068a3b..57aa52c 100644 --- a/pkg/templates/python/cua/providers/gemini.py +++ b/pkg/templates/python/cua/providers/gemini.py @@ -210,8 +210,10 @@ async def _execute_action( await asyncio.sleep(1.5) elif name == "key_combination": - combo = args.get("key_combination", "") - parts = [k.strip() for k in combo.split("+")] + # Gemini sends the combo as a single "+"-joined string in `keys`. + if "keys" not in args: + return {"error": "key_combination requires keys"} + parts = [k.strip() for k in str(args["keys"]).split("+")] hold_keys = parts[:-1] if len(parts) > 1 else [] keys = parts[-1:] if parts else [] kwargs: dict = {"keys": keys or parts} @@ -222,10 +224,11 @@ async def _execute_action( ) elif name == "drag_and_drop": - sx = self._denorm(args.get("start_x"), width) - sy = self._denorm(args.get("start_y"), height) - ex = self._denorm(args.get("end_x"), width) - ey = self._denorm(args.get("end_y"), height) + # Gemini's drag schema uses x/y for the start and destination_x/destination_y for the end. + sx = self._denorm(args.get("x"), width) + sy = self._denorm(args.get("y"), height) + ex = self._denorm(args.get("destination_x"), width) + ey = self._denorm(args.get("destination_y"), height) await asyncio.to_thread( computer.drag_mouse, options.session_id, path=[[sx, sy], [ex, ey]], ) diff --git a/pkg/templates/python/cua/session.py b/pkg/templates/python/cua/session.py index 3cbe254..9aadbf7 100644 --- a/pkg/templates/python/cua/session.py +++ b/pkg/templates/python/cua/session.py @@ -114,6 +114,7 @@ async def stop(self) -> SessionInfo: info = self.info if self._session_id: + session_id = self._session_id try: if self.opts.record_replay and self._replay_id: if self.opts.replay_grace_period > 0: @@ -121,16 +122,18 @@ async def stop(self) -> SessionInfo: await self._stop_replay() info.replay_view_url = self._replay_view_url finally: - print(f"Destroying browser session: {self._session_id}") + # Reset state up front so that if browser deletion or a thrown replay + # error propagates, a follow-up stop() call from the caller's error path + # is a no-op instead of attempting to delete the same session twice. + self._session_id = None + self._live_view_url = None + self._replay_id = None + self._replay_view_url = None + print(f"Destroying browser session: {session_id}") await asyncio.to_thread( - self.kernel.browsers.delete_by_id, self._session_id, + self.kernel.browsers.delete_by_id, session_id, ) - self._session_id = None - self._live_view_url = None - self._replay_id = None - self._replay_view_url = None - return info async def _stop_replay(self) -> None: diff --git a/pkg/templates/typescript/cua/providers/gemini.ts b/pkg/templates/typescript/cua/providers/gemini.ts index 918987a..0ecbb49 100644 --- a/pkg/templates/typescript/cua/providers/gemini.ts +++ b/pkg/templates/typescript/cua/providers/gemini.ts @@ -48,13 +48,11 @@ interface GeminiArgs { y?: number; text?: string; url?: string; - key_combination?: string; + keys?: string; direction?: string; magnitude?: number; - start_x?: number; - start_y?: number; - end_x?: number; - end_y?: number; + destination_x?: number; + destination_y?: number; safety_decision?: { decision: string; explanation?: string }; [key: string]: unknown; } @@ -221,8 +219,9 @@ export class GeminiProvider implements CuaProvider { break; } case 'key_combination': { - const combo = args.key_combination ?? ''; - const parts = combo.split('+').map(k => k.trim()); + // Gemini sends the combo as a single "+"-joined string in `keys`. + if (!args.keys) return { error: 'key_combination requires keys' }; + const parts = args.keys.split('+').map(k => k.trim()); const holdKeys = parts.slice(0, -1); const keys = parts.slice(-1); await computer.pressKey(sessionId, { @@ -232,10 +231,11 @@ export class GeminiProvider implements CuaProvider { break; } case 'drag_and_drop': { - const sx = this.denormalize(args.start_x, width); - const sy = this.denormalize(args.start_y, height); - const ex = this.denormalize(args.end_x, width); - const ey = this.denormalize(args.end_y, height); + // Gemini's drag schema uses x/y for the start and destination_x/destination_y for the end. + const sx = this.denormalize(args.x, width); + const sy = this.denormalize(args.y, height); + const ex = this.denormalize(args.destination_x, width); + const ey = this.denormalize(args.destination_y, height); await computer.dragMouse(sessionId, { path: [[sx, sy], [ex, ey]] }); break; } diff --git a/pkg/templates/typescript/cua/session.ts b/pkg/templates/typescript/cua/session.ts index 8492238..8b34b60 100644 --- a/pkg/templates/typescript/cua/session.ts +++ b/pkg/templates/typescript/cua/session.ts @@ -106,6 +106,7 @@ export class KernelBrowserSession { const info = this.info; if (this._sessionId) { + const sessionId = this._sessionId; try { if (this.opts.recordReplay && this._replayId) { if (this.opts.replayGracePeriod > 0) { @@ -115,16 +116,18 @@ export class KernelBrowserSession { info.replayViewUrl = this._replayViewUrl || undefined; } } finally { - console.log(`Destroying browser session: ${this._sessionId}`); - await this.kernel.browsers.deleteByID(this._sessionId); + // Reset state up front so that if browser deletion or a thrown replay error + // propagates, a follow-up stop() call from the caller's error path is a no-op + // instead of attempting to delete the same session twice. + this._sessionId = null; + this._liveViewUrl = null; + this._replayId = null; + this._replayViewUrl = null; + console.log(`Destroying browser session: ${sessionId}`); + await this.kernel.browsers.deleteByID(sessionId); } } - this._sessionId = null; - this._liveViewUrl = null; - this._replayId = null; - this._replayViewUrl = null; - return info; }