def chat_with_workflow(
    self,
    chat: List[Message],
    self_reflection: bool = False,
    display_visualization: bool = False,
) -> Dict[str, Any]:
    """Chat with Vision Agent and return intermediate information regarding the
    task.

    Parameters:
        chat (List[MediaChatItem]): A conversation in the format of:
            [{"role": "user", "content": "describe your task here..."}]
            or, if it contains media files, in the format of:
            [{"role": "user", "content": "describe your task here...",
              "media": ["image1.jpg", "image2.jpg"]}]
        self_reflection (bool): Whether to reflect on the task and debug the code.
        display_visualization (bool): If True, opens a new window locally to show
            the image(s) created by visualization code (if there are any).

    Returns:
        Dict[str, Any]: A dictionary containing the code, test, test result, plan,
            and working memory of the agent.
    """
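    # Illustrative usage only (a sketch, not part of this file): assuming this
    # method lives on an agent class instantiated as, e.g., `agent` (the class
    # name and constructor arguments below are hypothetical), a call might look
    # like:
    #
    #     agent = VisionAgent(verbosity=1)
    #     result = agent.chat_with_workflow(
    #         [
    #             {
    #                 "role": "user",
    #                 "content": "Count the number of cars in this image.",
    #                 "media": ["cars.jpg"],
    #             }
    #         ],
    #         self_reflection=False,
    #     )
    #     print(result["code"])
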
    if not chat:
        raise ValueError("Chat cannot be empty.")

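    # Each call gets its own code interpreter instance; media files from the chat
    # are uploaded into it so the generated code can refer to them by name.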
    with CodeInterpreterFactory.new_instance() as code_interpreter:
        chat = copy.deepcopy(chat)
        media_list = []
        for chat_i in chat:
            if "media" in chat_i:
                for media in chat_i["media"]:
                    media = code_interpreter.upload_file(media)
                    chat_i["content"] += f" Media name {media}"
                    media_list.append(media)

        int_chat = cast(
            List[Message],
            [{"role": c["role"], "content": c["content"]} for c in chat],
        )

        code = ""
        test = ""
        working_memory: List[Dict[str, str]] = []
        results = {"code": "", "test": "", "plan": []}
        plan = []
        success = False
        retries = 0

        while not success and retries < self.max_retries:
            self.log_progress(
                {
                    "type": "plans",
                    "status": "started",
                }
            )
            plan_i = write_plan(
                int_chat,
                T.TOOL_DESCRIPTIONS,
                format_memory(working_memory),
                self.planner,
            )
            plan_i_str = "\n-".join([e["instructions"] for e in plan_i])

            self.log_progress(
                {
                    "type": "plans",
                    "status": "completed",
                    "payload": plan_i,
                }
            )
            if self.verbosity >= 1:
                _LOGGER.info(
                    f"\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                )

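            # Recommend tools relevant to the current plan and collect their
            # documentation for the coder.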
            tool_info = retrieve_tools(
                plan_i,
                self.tool_recommender,
                self.log_progress,
                self.verbosity,
            )
            results = write_and_test_code(
                chat=int_chat,
                tool_info=tool_info,
                tool_utils=T.UTILITIES_DOCSTRING,
                working_memory=working_memory,
                coder=self.coder,
                tester=self.tester,
                debugger=self.debugger,
                code_interpreter=code_interpreter,
                log_progress=self.log_progress,
                verbosity=self.verbosity,
                media=media_list,
            )
            success = cast(bool, results["success"])
            code = cast(str, results["code"])
            test = cast(str, results["test"])
            working_memory.extend(results["working_memory"])
            plan.append({"code": code, "test": test, "plan": plan_i})

            if not self_reflection:
                break

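            # Ask the planner to reflect on whether the generated code fulfils
            # the original request before deciding to retry.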
            reflection = reflect(
                int_chat,
                FULL_TASK.format(
                    user_request=chat[0]["content"], subtasks=plan_i_str
                ),
                code,
                self.planner,
            )
            # Fold the reflection back into the loop state and count the attempt;
            # as written, the reflection result was unused and `retries` was never
            # incremented, so a failing loop could not terminate. Assumes reflect()
            # returns a dict with "feedback" and "success" keys (not shown in this
            # excerpt).
            feedback = cast(str, reflection["feedback"])
            success = cast(bool, reflection["success"])
            working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
            retries += 1

        execution_result = cast(Execution, results["test_result"])

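        # Optionally show any images or videos produced while running the tests.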
        if display_visualization:
            for res in execution_result.results:
                if res.png:
                    b64_to_pil(res.png).show()
                if res.mp4:
                    play_video(res.mp4)

        return {
            "code": DefaultImports.prepend_imports(code),
            "test": test,
            "test_result": execution_result,
            "plan": plan,
            "working_memory": working_memory,
        }