diff --git a/angel_system/data/common/bounding_boxes.py b/angel_system/data/common/bounding_boxes.py new file mode 100644 index 000000000..6811580e0 --- /dev/null +++ b/angel_system/data/common/bounding_boxes.py @@ -0,0 +1,23 @@ +from typing import * + + +class BoundingBoxes: + def __init__( + self, + left: List[int], + right: List[int], + top: List[int], + bottom: List[int], + item: List[Any], + ): + """ + Wrapper of bounding boxes and a contained entity corresponding to each bounding box. + The item is intentionally kept ambiguous to provide flexibility (e.g. can pass in + an object label that corresponds to each bounding box or a tuple of an object label and + its confidence score). + """ + self.left = left + self.right = right + self.top = top + self.bottom = bottom + self.item = item diff --git a/angel_system/global_step_prediction/global_step_predictor.py b/angel_system/global_step_prediction/global_step_predictor.py index 824f66953..5cb2d1fe0 100644 --- a/angel_system/global_step_prediction/global_step_predictor.py +++ b/angel_system/global_step_prediction/global_step_predictor.py @@ -684,8 +684,6 @@ def conditionally_reset_irrational_trackers(self, tracker, skip=False): ): print("reset condition hit!!") # import ipdb; ipdb.set_trace() - if tracker["recipe"] == "coffee": - print(f"tea step = {self.trackers[1]['current_granular_step']}") for tracker_ind in self.find_trackers_by_recipe( resetter_granular_step[recipe][1] ): @@ -696,10 +694,6 @@ def conditionally_reset_irrational_trackers(self, tracker, skip=False): ][0] ): self.reset_one_tracker(tracker_ind) - if tracker["recipe"] == "coffee": - print( - f"tea step after = {self.trackers[1]['current_granular_step']}" - ) else: for recipe in resetter_granular_step: granular_steps = [ diff --git a/angel_system/utils/object_detection_queues/__init__.py b/angel_system/utils/object_detection_queues/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py new file mode 100644 index 000000000..ba4944435 --- /dev/null +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py @@ -0,0 +1,118 @@ +import logging +import heapq +from scipy.spatial import distance +import threading +from typing import * + +from angel_system.data.common.bounding_boxes import BoundingBoxes + +LOG = logging.getLogger(__name__) + + +class Centroid2DStrategyQueue: + """ + Little class to handle priority queueing of detected object bounding boxes + based on their centroid (center coordinate of the bounding box). + Items are stored in a priority queue based on a timestamp integer. + When items are popped from the queue, the `last_n` items *before* a provided + timestamp are returned. + + + Typical Example Usage: + q = Centroid2DStrategyQueue(n=1, k=2) + q.add(timestamp=1, BoundingBoxes(..., [('obj1', 'obj2', 'obj3')])) + q.add(timestamp=2, BoundingBoxes(..., [('obj1', 'obj2', 'obj3')])) + q.get_n_before(2) + """ + + def __init__( + self, + n: int, + center_x: int, + center_y: int, + k: int = 1, + log_func: Optional[Callable[..., None]] = None, + ): + """ + Additional arguments are passed to the logging method + :param n: Whenever objects are retrieved, return the last n entries. + :param k: Acquires the top k objects that are the most centered given their centroid. + :param log_func: Optional callable to be invoked to receive the + message. 
If this is `None`, the local Logger instance to this + module is used. + """ + self._log_func = log_func + + self.n = n + self.k = k + + # This is the main priority queue. Each item should be a Tuple[int, Any] in which + # the elements correspond to (Integer Timestamp, Any Object). An example of the queued + # object's second element could be a Tuple of the top K detected objects. + self.pq = [] + self.center_x = center_x + self.center_y = center_y + self.lock = threading.Lock() + + def get_queue(self): + return self.pq + + def add(self, timestamp: int, bounding_boxed_item: BoundingBoxes): + self.lock.acquire() + k_most_centered_objects = self._get_k_most_center_objects(bounding_boxed_item) + heapq.heappush(self.pq, (timestamp, k_most_centered_objects)) + self.lock.release() + + def get_n_before(self, timestamp: int) -> List[Any]: + """ + Gets the self.n items before the provided timestamp. + """ + items = [] + self.lock.acquire() + while self.pq: + next_timestamp, _ = self.pq[0] + if next_timestamp < timestamp: + items.append(heapq.heappop(self.pq)) + else: + break + self.lock.release() + if self._log_func: + self._log_func( + f"Read up to {self.n} items from queue" + + "; ".join([f"{item} @ Time={time}" for time, item in items]) + ) + return items[-self.n :] if items else items + + def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: + """ + Acquires the top k objects with respect to centroid distance from the center pixel. + Returns a list of Tuples of (centroid distance, top k most centered objects) + """ + k_most_centered_objects = [] + + # Sort the bounding boxes in order of distance from centroid to center pixel. + zipped = zip(bb.item, bb.left, bb.right, bb.top, bb.bottom) + for item, left, right, top, bottom in zipped: + centroid_x, centroid_y = self._get_centroid(left, right, top, bottom) + dist = distance.euclidean( + [centroid_x, centroid_y], [self.center_x, self.center_y] + ) + heapq.heappush(k_most_centered_objects, (dist, item)) + + # Return the top k centered objects based on centroid distance. + result = [] + for _ in range(self.k): + if not k_most_centered_objects: + break + result.append(heapq.heappop(k_most_centered_objects)) + return result + + def _get_centroid( + self, left: int, right: int, top: int, bottom: int + ) -> Tuple[int, int]: + """ + Calculates the center 2D pixel of a 2D bounding box. + """ + width_center = left + int((right - left) / 2) + height_center = top + int((bottom - top) / 2) + return [width_center, height_center] diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py new file mode 100644 index 000000000..36dbf4b4e --- /dev/null +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py @@ -0,0 +1,316 @@ +import unittest + +from centroid_2d_strategy_queue import Centroid2DStrategyQueue +from angel_system.data.common.bounding_boxes import BoundingBoxes + + +RESOLUTION_W = 1920 +RESOLUTION_H = 1080 + + +class Centroid2DStrategyQueueTest(unittest.TestCase): + def test_queue_n3_k1_insertion(self): + """ + Tests proper queue insertion when objects are inserted as strings. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2 + ) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. 
+ first_objects_detected = BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ["mug", "computer", "dog"], + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ["ball", "butterfly", "cat"], + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], + ) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) + # Recall that each object is a List of Tuples of (centroid distance, detected object) + self.assertEqual(first_top_k[0][-1], "dog") + self.assertEqual(second_top_k[0][-1], "cat") + self.assertEqual(third_top_k[0][-1], "shoes") + + def test_queue_n3_k1_insertion_with_confidence_scores(self): + """ + Tests proper queue insertion when objects are inserted as Tuples with confidence scores. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2 + ) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + [("mug", 0.1), ("computer", 0.8), ("dog", 0.5)], + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + [("ball", 0.9), ("butterfly", 0.3), ("cat", 0.5)], + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. 
+ third_objects_detected = BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4, 2], + [("shoes", 0.9), ("pencil", 0.3), ("child", 0.5)], + ) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) + # Recall that each object is a List of Tuples: + # (centroid distance, (detected object, confidence score)) + _, obj_with_conf_score = first_top_k[0] + self.assertEqual(obj_with_conf_score[0], "dog") + _, obj_with_conf_score = second_top_k[0] + self.assertEqual(obj_with_conf_score[0], "cat") + _, obj_with_conf_score = third_top_k[0] + self.assertEqual(obj_with_conf_score[0], "shoes") + + def test_queue_n3_k2_insertion(self): + """ + Tests proper queue insertion when the top 2 objects are inserted as strings. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ["mug", "computer", "dog"], + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ["ball", "butterfly", "cat"], + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], + ) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) + # Recall that each object is a List of Tuples of (centroid distance, detected object) + + first_object_labels = [label for centroid, label in first_top_k] + self.assertEqual(["dog", "computer"], first_object_labels) + + second_object_labels = [label for centroid, label in second_top_k] + self.assertEqual(["cat", "butterfly"], second_object_labels) + + third_object_labels = [label for centroid, label in third_top_k] + self.assertEqual(["shoes", "pencil"], third_object_labels) + + def test_queue_n3_k2_removal(self): + """ + Tests proper queueing of the last 3 top 2 objects are inserted as strings. 
+ """ + q = Centroid2DStrategyQueue( + n=1, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ["mug", "computer", "dog"], + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ["ball", "butterfly", "cat"], + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], + ) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) + + no_items = q.get_n_before(timestamp=1) + self.assertEqual([], no_items) + # Expects the last n=1 detections before timestamp 4. This should be timestamp 3's + # top k=2 objects. + last_n_top_k = q.get_n_before(timestamp=4) + discarded_timestamp, third_top_k_with_centroid_dist = last_n_top_k[0] + third_top_k = [item for dist, item in third_top_k_with_centroid_dist] + self.assertEqual(["shoes", "pencil"], third_top_k) + + def test_queue_n2_k2_removal_with_confidence_scores(self): + """ + Tests proper queue removal of the last 2 top 2 detected objects before a given timestamp + when the top 2 objects are inserted as strings with confidence scores. + """ + q = Centroid2DStrategyQueue( + n=2, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + [("mug", 0.1), ("computer", 0.8), ("dog", 0.5)], + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + [("ball", 0.9), ("butterfly", 0.3), ("cat", 0.5)], + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. 
+ third_objects_detected = BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4, 2], + [("shoes", 0.9), ("pencil", 0.3), ("child", 0.5)], + ) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) + + no_items = q.get_n_before(timestamp=1) + self.assertEqual([], no_items) + # Expects the last n=2 detections before timestamp 4. This should be timestamp 2 and + # timestamp 3's top k=2 objects. + last_n_top_k = q.get_n_before(timestamp=4) + discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] + first_scored_top_k = [ + scored_item + for discarded_dist, scored_item in first_top_k_with_centroid_dist + ] + first_top_k = [item for item, score in first_scored_top_k] + self.assertEqual(["cat", "butterfly"], first_top_k) + discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] + second_scored_top_k = [ + scored_item + for discarded_dist, scored_item in second_top_k_with_centroid_dist + ] + second_top_k = [item for item, score in second_scored_top_k] + self.assertEqual(["shoes", "pencil"], second_top_k) + + def test_empty_queue(self): + """ + Tests proper get-behavior of an empty queue. + """ + q = Centroid2DStrategyQueue( + n=2, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + self.assertEqual([], q.get_n_before(timestamp=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/config/activity_labels/recipe_coffee.yaml b/config/activity_labels/recipe_coffee.yaml index 22ccea358..0fd0d5dea 100644 --- a/config/activity_labels/recipe_coffee.yaml +++ b/config/activity_labels/recipe_coffee.yaml @@ -69,7 +69,7 @@ labels: - id: 14 label: "pour-beans-filter" #full_str: "Transfer the grounds to the filter cone" - full_str: "pour the grounded coffee beans into the filter cone prepared in step 2" + full_str: "Pour the grounded coffee beans into the filter cone prepared in step 2" depends: [7, 13] - id: 15 label: "thermometer-turn-on" diff --git a/config/tasks/multi-task-config.yaml b/config/tasks/multi-task-config.yaml index 21daa9239..bfad10c19 100644 --- a/config/tasks/multi-task-config.yaml +++ b/config/tasks/multi-task-config.yaml @@ -14,11 +14,11 @@ tasks: - id: 0 label: "coffee" config_file: "./config/tasks/recipe_coffee.yaml" - active: true + active: false - id: 1 label: "tea" config_file: "./config/tasks/recipe_tea.yaml" - active: true + active: false - id: 2 label: "pinwheel" config_file: "./config/tasks/recipe_pinwheel.yaml" @@ -30,4 +30,4 @@ tasks: - id: 4 label: "dessert quesadilla" config_file: "./config/tasks/recipe_dessertquesadilla.yaml" - active: false + active: true diff --git a/config/tasks/recipe_coffee.yaml b/config/tasks/recipe_coffee.yaml index ad8e723cc..a5f851598 100644 --- a/config/tasks/recipe_coffee.yaml +++ b/config/tasks/recipe_coffee.yaml @@ -30,7 +30,7 @@ to create a quarter-circle. Place the paper filter in the dripper and spread ope label: "coffee-beans-to-grounds" full_str: "Weigh the coffee beans and grind until the coffee grounds are the consistency of coarse sand, about 20 seconds. Transfer the grounds to the filter cone." - activity_ids: [28, 29, 28, 30, 31, 32, 33] + activity_ids: [29, 28, 30, 31, 32, 33] - id: 5 label: "check-temp" full_str: "Check the temperature of the water." 
diff --git a/ros/angel_msgs/CMakeLists.txt b/ros/angel_msgs/CMakeLists.txt index e5abe06e3..bd9890ce4 100644 --- a/ros/angel_msgs/CMakeLists.txt +++ b/ros/angel_msgs/CMakeLists.txt @@ -28,6 +28,7 @@ set( message_files msg/AruiObject3d.msg msg/AruiUpdate.msg msg/AruiUserNotification.msg + msg/DialogueUtterance.msg msg/EyeGazeData.msg msg/HandJointPose.msg msg/HandJointPosesUpdate.msg diff --git a/ros/angel_msgs/msg/DialogueUtterance.msg b/ros/angel_msgs/msg/DialogueUtterance.msg new file mode 100644 index 000000000..49d3122ee --- /dev/null +++ b/ros/angel_msgs/msg/DialogueUtterance.msg @@ -0,0 +1,24 @@ +# +# Dialogue Utterance with additional information about the environmental state +# and user model. +# + +# The header primarily encapsulates when this message was emitted. +# The time component of this may be utilized as an identifier for this user +# intent and utterance. +std_msgs/Header header + +# Speech-to-text of the user utterance we have interpreted +string utterance_text + +# Below are optional fields. + +# Canonical user intent that has been interpreted. "Canonical" in this context +# means that this string may be used as an identifier of this type of user intent. +# The intent_confidence_score should be in the range [0,1], where 1.0 means absolute confidence. +string intent +float64 intent_confidence_score + +# Emotion classification. The emotion_confidence_score should be in the range [0,1], where 1.0 means absolute confidence. +string emotion +float64 emotion_confidence_score diff --git a/ros/angel_msgs/msg/TaskUpdate.msg b/ros/angel_msgs/msg/TaskUpdate.msg index fdfd5005b..73fe8db56 100644 --- a/ros/angel_msgs/msg/TaskUpdate.msg +++ b/ros/angel_msgs/msg/TaskUpdate.msg @@ -20,6 +20,7 @@ int8 current_step_id # String of the step currently in progress. string current_step + # Previous step is the step worked on before the current step.
string previous_step diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py index 5eb352e7b..a8bbc0e9f 100644 --- a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py +++ b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py @@ -11,7 +11,7 @@ from rclpy.node import Node import simpleaudio as sa -from angel_msgs.msg import HeadsetAudioData, Utterance +from angel_msgs.msg import HeadsetAudioData, DialogueUtterance AUDIO_TOPIC = "audio_topic" @@ -105,7 +105,9 @@ def __init__(self): self.subscription = self.create_subscription( HeadsetAudioData, self._audio_topic, self.listener_callback, 1 ) - self._publisher = self.create_publisher(Utterance, self._utterances_topic, 1) + self._publisher = self.create_publisher( + DialogueUtterance, self._utterances_topic, 1 + ) self.audio_stream = [] self.t = threading.Thread() @@ -204,15 +206,19 @@ def asr_server_request_thread(self, audio_data, num_channels, sample_rate): self.log.info("Complete ASR text is:\n" + f'"{response_text}"') if self._is_sentence_tokenize_mode: for sentence in sent_tokenize(response_text): - utterance_msg = Utterance() - utterance_msg.value = sentence + msg = DialogueUtterance() + msg.header.frame_id = "ASR" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = sentence self.log.info("Publishing message: " + f'"{sentence}"') - self._publisher.publish(utterance_msg) + self._publisher.publish(msg) else: - utterance_msg = Utterance() - utterance_msg.value = response_text + msg = DialogueUtterance() + msg.header.frame_id = "ASR" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = response_text self.log.info("Publishing message: " + f'"{response_text}"') - self._publisher.publish(utterance_msg) + self._publisher.publish(msg) def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py new file mode 100644 index 000000000..8e72bfd4f --- /dev/null +++ b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py @@ -0,0 +1,68 @@ +from abc import ABC +import rclpy +from rclpy.node import Node + +from angel_msgs.msg import DialogueUtterance + + +class BaseDialogueSystemNode(Node): + """ + This class is used for all dialogue system nodes to inherit similar + functionality. + """ + + def __init__(self): + super().__init__(self.__class__.__name__) + self.log = self.get_logger() + + def get_intent_or( + self, src_msg: DialogueUtterance, or_value: str = "not available" + ) -> str: + """ + Returns the src_msg intent classification information. If the value is absent, + the or_value is passed in. + """ + return src_msg.intent if src_msg.intent else or_value + + def get_emotion_or( + self, src_msg: DialogueUtterance, or_value: str = "not available" + ) -> str: + """ + Returns the src_msg emotion classification information. If the value is absent, + the or_value is passed in. + """ + return src_msg.emotion if src_msg.emotion else or_value + + def copy_dialogue_utterance( + self, src_msg: DialogueUtterance, node_name: str = "Dialogue System Node" + ) -> DialogueUtterance: + msg = DialogueUtterance() + msg.header.frame_id = node_name + msg.utterance_text = src_msg.utterance_text + + # Assign new time for publication. + msg.header.stamp = self.get_clock().now().to_msg() + + # Copy over intent classification information if present. 
if src_msg.intent: + msg.intent = src_msg.intent + msg.intent_confidence_score = src_msg.intent_confidence_score + + # Copy over emotion classification information if present. + if src_msg.emotion: + msg.emotion = src_msg.emotion + msg.emotion_confidence_score = src_msg.emotion_confidence_score + + return msg + + +def main(): + rclpy.init() + base_dialogue_node = BaseDialogueSystemNode() + rclpy.spin(base_dialogue_node) + base_dialogue_node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index 255000739..996688610 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -5,11 +5,11 @@ import threading from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -from angel_msgs.msg import InterpretedAudioUserEmotion, InterpretedAudioUserIntent +from angel_msgs.msg import DialogueUtterance +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters -IN_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" -IN_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" +IN_TOPIC = "input_topic" OUT_INTERP_USER_EMOTION_TOPIC = "user_emotion_topic" # Currently supported emotions. This is tied with the emotions @@ -23,45 +23,36 @@ VADER_POSITIVE_COMPOUND_THRESHOLD = 0.05 -class BaseEmotionDetector(Node): +class BaseEmotionDetector(BaseDialogueSystemNode): """ - As of Q22023, emotion detection is derived via VaderSentiment - (https://github.com/cjhutto/vaderSentiment). + This is the base emotion detection node that other emotion detection nodes + should inherit from. """ def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() # Handle parameterization. param_values = declare_and_get_parameters( self, [ - (IN_EXPECT_USER_INTENT_TOPIC,), - (IN_INTERP_USER_INTENT_TOPIC,), + (IN_TOPIC,), (OUT_INTERP_USER_EMOTION_TOPIC,), ], ) - self._in_expect_uintent_topic = param_values[IN_EXPECT_USER_INTENT_TOPIC] - self._in_interp_uintent_topic = param_values[IN_INTERP_USER_INTENT_TOPIC] + self._input_topic = param_values[IN_TOPIC] self._out_interp_uemotion_topic = param_values[OUT_INTERP_USER_EMOTION_TOPIC] - # Handle subscription/publication topics. - self.expect_uintent_subscription = self.create_subscription( - InterpretedAudioUserIntent, - self._in_expect_uintent_topic, - self.intent_detection_callback, + self.subscription = self.create_subscription( + DialogueUtterance, + self._input_topic, + self.emotion_detection_callback, 1, ) - self.interp_uintent_subscription = self.create_subscription( - InterpretedAudioUserIntent, - self._in_interp_uintent_topic, - self.intent_detection_callback, - 1, - ) - self._interp_emo_publisher = self.create_publisher( - InterpretedAudioUserEmotion, self._out_interp_uemotion_topic, 1 + self.emotion_publication = self.create_publisher( + DialogueUtterance, self._out_interp_uemotion_topic, 1 ) self.message_queue = queue.Queue() @@ -94,21 +85,19 @@ def _get_vader_sentiment_analysis(self, utterance: str): ) return (classification, confidence) - def get_inference(self, msg): + def get_inference(self, msg: DialogueUtterance): """ Abstract away the different model inference calls depending on the node's configured model mode.
""" return self._get_vader_sentiment_analysis(msg.utterance_text) - def intent_detection_callback(self, msg): + def emotion_detection_callback(self, msg): """ This is the main ROS node listener callback loop that will process all messages received via subscribed topics. """ self.log.debug(f'Received message:\n\n"{msg.utterance_text}"') - if not self._apply_filter(msg): - return self.message_queue.put(msg) def process_message_queue(self): @@ -119,42 +108,28 @@ def process_message_queue(self): msg = self.message_queue.get() self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') classification, confidence_score = self.get_inference(msg) - self.publish_detected_emotion( - msg.utterance_text, classification, confidence_score - ) + self.publish_detected_emotion(msg, classification, confidence_score) def publish_detected_emotion( - self, utterance: str, classification: str, confidence_score: float + self, sub_msg: DialogueUtterance, classification: str, confidence_score: float ): """ Handles message publishing for an utterance with a detected emotion classification. """ - emotion_msg = InterpretedAudioUserEmotion() - emotion_msg.header.frame_id = "Emotion Detection" - emotion_msg.header.stamp = self.get_clock().now().to_msg() - emotion_msg.utterance_text = utterance - emotion_msg.user_emotion = classification - emotion_msg.confidence = confidence_score - self._interp_emo_publisher.publish(emotion_msg) - colored_utterance = colored(utterance, "light_blue") - colored_emotion = colored(classification, "light_green") + pub_msg = self.copy_dialogue_utterance(sub_msg, node_name="Emotion Detection") + # Overwrite the user emotion with the latest classification information. + pub_msg.emotion = classification + pub_msg.emotion_confidence_score = confidence_score + self.emotion_publication.publish(pub_msg) + + # Log emotion detection information. + colored_utterance = colored(pub_msg.utterance_text, "light_blue") + colored_emotion = colored(pub_msg.emotion, "light_green") self.log.info( f'Publishing {{"{colored_emotion}": {confidence_score}}} ' + f'to {self._out_interp_uemotion_topic} for:\n>>> "{colored_utterance}"' ) - def _apply_filter(self, msg): - """ - Abstracts away any filtering to apply on received messages. Return - none if the message should be filtered out. Else, return the incoming - msg if it can be included. - """ - # if msg.user_intent.lower() == "user inquiry": - # return msg - # else: - # return None - return msg - def main(): rclpy.init() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py index 4d0afdacf..7dcc6d31e 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py @@ -4,7 +4,8 @@ from termcolor import colored import threading -from angel_msgs.msg import InterpretedAudioUserIntent, Utterance +from angel_msgs.msg import DialogueUtterance +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters NEXT_STEP_KEYPHRASES = ["skip", "next", "next step"] @@ -16,40 +17,40 @@ # config/angel_system_cmds/user_intent_to_sys_cmd_v1.yaml. # Please refer to labels defined in # https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . 
-INTENT_LABELS = ["next_step", "prev_step", "inquiry", "other"] +INTENT_LABELS = ["next_step", "prev_step", "inquiry", "object_clarification", "other"] -UTTERANCES_TOPIC = "utterances_topic" +IN_TOPIC = "input_topic" PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" -class BaseIntentDetector(Node): +class BaseIntentDetector(BaseDialogueSystemNode): def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() # Handle parameterization. param_values = declare_and_get_parameters( self, [ - (UTTERANCES_TOPIC,), + (IN_TOPIC,), (PARAM_EXPECT_USER_INTENT_TOPIC,), (PARAM_INTERP_USER_INTENT_TOPIC,), ], ) - self._utterances_topic = param_values[UTTERANCES_TOPIC] + self._input_topic = param_values[IN_TOPIC] self._expect_uintent_topic = param_values[PARAM_EXPECT_USER_INTENT_TOPIC] self._interp_uintent_topic = param_values[PARAM_INTERP_USER_INTENT_TOPIC] # Handle subscription/publication topics. self.subscription = self.create_subscription( - Utterance, self._utterances_topic, self.utterance_callback, 1 + DialogueUtterance, self._input_topic, self.utterance_callback, 1 ) self._expected_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._expect_uintent_topic, 1 + DialogueUtterance, self._expect_uintent_topic, 1 ) self._interp_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._interp_uintent_topic, 1 + DialogueUtterance, self._interp_uintent_topic, 1 ) self.utterance_message_queue = queue.Queue() @@ -63,7 +64,7 @@ def utterance_callback(self, msg): This is the main ROS node listener callback loop that will process all messages received via subscribed topics. """ - self.log.debug(f'Received message:\n\n"{msg.value}"') + self.log.debug(f'Received message:\n\n"{msg.utterance_text}"') self.utterance_message_queue.put(msg) def process_utterance_message_queue(self): @@ -72,13 +73,13 @@ def process_utterance_message_queue(self): """ while True: msg = self.utterance_message_queue.get() - self.log.debug(f'Processing message:\n\n"{msg.value}"') + self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') intent, score = self.detect_intents(msg) if not intent: continue - self.publish_msg(msg.value, intent, score) + self.publish_msg(msg, intent, score) - def detect_intents(self, msg): + def detect_intents(self, msg: DialogueUtterance): """ Keyphrase search for intent detection. This implementation does simple string matching to assign a detected label. When multiple intents are @@ -98,7 +99,7 @@ def _tiebreak_intents(intents, confidences): ) return classification, score - lower_utterance = msg.value.lower() + lower_utterance = msg.utterance_text.lower() intents = [] confidences = [] if self._contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES): @@ -111,7 +112,7 @@ def _tiebreak_intents(intents, confidences): intents.append(INTENT_LABELS[2]) confidences.append(0.5) if not intents: - colored_utterance = colored(msg.value, "light_blue") + colored_utterance = colored(msg.utterance_text, "light_blue") self.log.info(f'No intents detected for:\n>>> "{colored_utterance}":') return None, -1.0 @@ -119,27 +120,27 @@ def _tiebreak_intents(intents, confidences): classification = colored(classification, "light_green") return classification, confidence - def publish_msg(self, utterance, intent, score): + def publish_msg(self, sub_msg: DialogueUtterance, intent: str, score: float): """ Handles message publishing for an utterance with a detected intent. 
""" - intent_msg = InterpretedAudioUserIntent() - intent_msg.header.frame_id = "Intent Detection" - intent_msg.header.stamp = self.get_clock().now().to_msg() - intent_msg.utterance_text = utterance - intent_msg.user_intent = intent - intent_msg.confidence = score + pub_msg = self.copy_dialogue_utterance(sub_msg, node_name="Intent Detection") + # Overwrite the user intent with the latest classification information. + pub_msg.intent = intent + pub_msg.intent_confidence_score = score + + # Decide which intent topic to publish the message to. published_topic = None - if self._contains_phrase(utterance.lower(), OVERRIDE_KEYPHRASES): - intent_msg.confidence = 1.0 - self._expected_publisher.publish(intent_msg) + if self._contains_phrase(pub_msg.utterance_text.lower(), OVERRIDE_KEYPHRASES): + pub_msg.intent_confidence_score = 1.0 + self._expected_publisher.publish(pub_msg) published_topic = PARAM_EXPECT_USER_INTENT_TOPIC - else: - self._interp_publisher.publish(intent_msg) - published_topic = PARAM_INTERP_USER_INTENT_TOPIC + self._interp_publisher.publish(pub_msg) + published_topic = PARAM_INTERP_USER_INTENT_TOPIC - colored_utterance = colored(utterance, "light_blue") - colored_intent = colored(intent_msg.user_intent, "light_green") + # Log intent detection information. + colored_utterance = colored(pub_msg.utterance_text, "light_blue") + colored_intent = colored(pub_msg.intent, "light_green") self.log.info( f'Publishing {{"{colored_intent}": {score}}} to {published_topic} ' + f'for:\n>>> "{colored_utterance}"' diff --git a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py index 79714f821..3a9c8b873 100644 --- a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py +++ b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py @@ -15,6 +15,7 @@ SystemTextResponse, TaskUpdate, VisionBoundingBox3d, + DialogueUtterance, ) from angel_utils import declare_and_get_parameters @@ -26,6 +27,7 @@ PARAM_ARUI_UPDATE_TOPIC = "arui_update_topic" PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" PARAM_SYSTEM_TEXT_RESPONSE_TOPIC = "system_text_response_topic" +PARAM_UTTERANCE_TOPIC = "utterances_topic" class FeedbackGenerator(Node): @@ -55,6 +57,7 @@ def __init__(self): (PARAM_ARUI_UPDATE_TOPIC,), (PARAM_INTERP_USER_INTENT_TOPIC,), (PARAM_SYSTEM_TEXT_RESPONSE_TOPIC,), + (PARAM_UTTERANCE_TOPIC,), ], ) @@ -66,6 +69,9 @@ def __init__(self): self._system_text_response_topic = param_values[ PARAM_SYSTEM_TEXT_RESPONSE_TOPIC ] + self._utterance_topic = param_values[ + PARAM_UTTERANCE_TOPIC + ] # subscribers self.activity_subscriber = self.create_subscription( @@ -94,6 +100,13 @@ def __init__(self): 1, ) + self.utterance_subscriber = self.create_subscription( + DialogueUtterance, + self._utterance_topic, + self.utterance_callback, + 1, + ) + # publisher self.arui_update_publisher = self.create_publisher( AruiUpdate, self._arui_update_topic, 1 @@ -248,11 +261,44 @@ def system_text_response_callback(self, msg: SystemTextResponse) -> None: notification.category = notification.N_CAT_NOTICE notification.context = notification.N_CONTEXT_USER_MODELING - notification.title = f"System response for: {msg.utterance_text}" + notification.title = f"{msg.utterance_text}" notification.description = f"{msg.response}" self.publish_update(notifications=[notification]) + def utterance_callback(self, msg: DialogueUtterance) -> None: + """ + This is the main ROS node listener callback loop that will process + all messages received via 
subscribed topics. + """ + keyword_check = msg.utterance_text[0:8] + if "Angel," in keyword_check or "angel," in keyword_check or\ + "Angela," in keyword_check or "angela," in keyword_check or\ + "Angel" in keyword_check or "angel" in keyword_check or\ + "Angela" in keyword_check or "angela" in keyword_check: + arui_message = msg.utterance_text + arui_message = arui_message.replace("Angel, ", "") + arui_message = arui_message.replace("angel, ", "") + arui_message = arui_message.replace("Angela, ", "") + arui_message = arui_message.replace("angela, ", "") + arui_message = arui_message.replace("Angel ", "") + arui_message = arui_message.replace("angel ", "") + arui_message = arui_message.replace("Angela ", "") + arui_message = arui_message.replace("angela ", "") + arui_message = arui_message.capitalize() + + # Create an AruiUserNotification msg with this information + notification = AruiUserNotification() + + notification.category = notification.N_CAT_NOTICE + notification.context = notification.N_CONTEXT_USER_MODELING + + notification.title = f"{arui_message}" + notification.description = "" + + self.publish_update(notifications=[notification]) + + def main(): rclpy.init() diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index 19a15fdd5..c304c9117 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -5,7 +5,9 @@ import os import rclpy +from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_emotion_detector import BaseEmotionDetector, LABEL_MAPPINGS +from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") @@ -14,18 +16,28 @@ FEW_SHOT_EXAMPLES = [ { "utterance": "Go back to the previous step you dumb machine!", - "label": "negative.", + "label": "negative[eos]", }, - {"utterance": "Next step, please.", "label": "neutral"}, - {"utterance": "We're doing great and I'm learning a lot!", "label": "positive"}, + {"utterance": "Next step, please.", "label": "neutral[eos]"}, + {"utterance": "We're doing great and I'm learning a lot!", "label": "positive[eos]"}, ] +PARAM_TIMEOUT = "timeout" + class GptEmotionDetector(BaseEmotionDetector): def __init__(self): super().__init__() self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (PARAM_TIMEOUT, 10), + ], + ) + self.timeout = param_values[PARAM_TIMEOUT] + # This node additionally includes fields for interacting with OpenAI # via LangChain. if not os.getenv("OPENAI_API_KEY"): @@ -77,14 +89,16 @@ def _labels_list_str(labels): openai_api_key=self.openai_api_key, temperature=0.0, max_tokens=1, + request_timeout=self.timeout, ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) - def get_inference(self, msg): + def get_inference(self, msg: DialogueUtterance): """ Detects the user emotion via langchain execution of GPT.
""" - return (self.chain.run(utterance=msg.utterance_text), 0.5) + emotion = self.chain.run(utterance=msg.utterance_text) + return emotion.split('[eos]')[0], 0.5 def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py index e09f6bffa..b41a0d9ea 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py @@ -6,25 +6,38 @@ import os import rclpy +from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_intent_detector import BaseIntentDetector, INTENT_LABELS +from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") # The following are few shot examples when prompting GPT. FEW_SHOT_EXAMPLES = [ - {"utterance": "Go back to the previous step!", "label": "prev_step."}, - {"utterance": "Next step, please.", "label": "next_step"}, - {"utterance": "How should I wrap this tourniquet?", "label": "inquiry"}, - {"utterance": "The sky is blue", "label": "other"}, + {"utterance": "Go back to the previous step!", "label": "prev_step[eos]"}, + {"utterance": "Next step, please.", "label": "next_step[eos]"}, + {"utterance": "How should I wrap this tourniquet?", "label": "inquiry[eos]"}, + {"utterance": "The sky is blue", "label": "other[eos]"}, + {"utterance": "What is this thing?", "label": "object_clarification[eos]"}, ] +PARAM_TIMEOUT = "timeout" + class GptIntentDetector(BaseIntentDetector): def __init__(self): super().__init__() self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (PARAM_TIMEOUT, 600), + ], + ) + self.timeout = param_values[PARAM_TIMEOUT] + # This node additionally includes fields for interacting with OpenAI # via LangChain. if not os.getenv("OPENAI_API_KEY"): @@ -77,15 +90,17 @@ def _labels_list_str(labels): temperature=0.0, # Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e. # 'next_step' counts as 2 tokens). - max_tokens=2, + # max_tokens=10, + request_timeout=self.timeout, ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) - def detect_intents(self, msg): + def detect_intents(self, msg: DialogueUtterance): """ Detects the user intent via langchain execution of GPT. """ - return self.chain.run(utterance=msg), 0.5 + intent = self.chain.run(utterance=msg.utterance_text) + return intent.split('[eos]')[0], 0.5 def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/intent_detector.py deleted file mode 100644 index 618cf02ca..000000000 --- a/ros/angel_system_nodes/angel_system_nodes/intent_detector.py +++ /dev/null @@ -1,133 +0,0 @@ -import rclpy -from rclpy.node import Node - -from angel_msgs.msg import InterpretedAudioUserIntent, Utterance - -# Please refer to labels defined in -# https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . -NEXT_STEP_KEYPHRASES = ["skip", "next", "next step"] -PREV_STEP_KEYPHRASES = ["previous", "previous step", "last step", "go back"] -OVERRIDE_KEYPHRASES = ["angel", "angel system"] - -# TODO(derekahmed): Please figure out how to keep this sync-ed with -# config/angel_system_cmds/user_intent_to_sys_cmd_v1.yaml. 
-LABELS = ["Go to next step", "Go to previous step"] - - -UTTERANCES_TOPIC = "utterances_topic" -PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" -PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" - - -class IntentDetector(Node): - """ - As of Q12023, intent detection is derived heuristically. This will be shifted - to a model-based approach in the near-future. - """ - - def __init__(self): - super().__init__(self.__class__.__name__) - self.log = self.get_logger() - - parameter_names = [ - UTTERANCES_TOPIC, - PARAM_EXPECT_USER_INTENT_TOPIC, - PARAM_INTERP_USER_INTENT_TOPIC, - ] - set_parameters = self.declare_parameters( - namespace="", - parameters=[(p,) for p in parameter_names], - ) - # Check for not-set parameters - some_not_set = False - for p in set_parameters: - if p.type_ is rclpy.parameter.Parameter.Type.NOT_SET: - some_not_set = True - self.log.error(f"Parameter not set: {p.name}") - if some_not_set: - raise ValueError("Some parameters are not set.") - - self._utterances_topic = self.get_parameter(UTTERANCES_TOPIC).value - self._expect_uintent_topic = self.get_parameter( - PARAM_EXPECT_USER_INTENT_TOPIC - ).value - self._interp_uintent_topic = self.get_parameter( - PARAM_INTERP_USER_INTENT_TOPIC - ).value - self.log.info( - f"Utterances topic: " - f"({type(self._utterances_topic).__name__}) " - f"{self._utterances_topic}" - ) - self.log.info( - f"Expected User Intent topic: " - f"({type(self._expect_uintent_topic).__name__}) " - f"{self._expect_uintent_topic}" - ) - self.log.info( - f"Interpreted User Intent topic: " - f"({type(self._interp_uintent_topic).__name__}) " - f"{self._interp_uintent_topic}" - ) - - # TODO(derekahmed): Add internal queueing to reduce subscriber queue - # size to 1. - self.subscription = self.create_subscription( - Utterance, self._utterances_topic, self.listener_callback, 10 - ) - - self._expected_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._expect_uintent_topic, 1 - ) - - self._interp_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._interp_uintent_topic, 1 - ) - - def listener_callback(self, msg): - log = self.get_logger() - intent_msg = InterpretedAudioUserIntent() - intent_msg.utterance_text = msg.value - - lower_utterance = msg.value.lower() - if self.contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES): - intent_msg.user_intent = LABELS[0] - intent_msg.confidence = 0.5 - elif self.contains_phrase(lower_utterance, PREV_STEP_KEYPHRASES): - intent_msg.user_intent = LABELS[1] - intent_msg.confidence = 0.5 - else: - log.info(f'Detected no intents for "{msg.value}":') - return - - if self.contains_phrase(lower_utterance, OVERRIDE_KEYPHRASES): - intent_msg.confidence = 1.0 - self._expected_publisher.publish(intent_msg) - else: - self._interp_publisher.publish(intent_msg) - - log.info( - f'Detected intents for "{msg.value}":\n' - + f'"{intent_msg.user_intent}": {intent_msg.confidence}' - ) - - def contains_phrase(self, utterance, phrases): - for phrase in phrases: - if phrase in utterance: - return True - return False - - -def main(): - rclpy.init() - - intentDetector = IntentDetector() - - rclpy.spin(intentDetector) - - intentDetector.destroy_node() - rclpy.shutdown() - - -if __name__ == "__main__": - main() diff --git a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py index 7572f3641..d7c496b76 100644 --- a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py +++ 
b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py @@ -8,33 +8,37 @@ from termcolor import colored import threading -from angel_msgs.msg import InterpretedAudioUserEmotion, SystemTextResponse +from angel_msgs.msg import DialogueUtterance, SystemTextResponse +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") -IN_EMOTION_TOPIC = "user_emotion_topic" +INPUT_TOPIC = "input_topic" OUT_QA_TOPIC = "system_text_response_topic" FEW_SHOT_PROMPT = "few_shot_prompt_file" +PARAM_TIMEOUT = "timeout" -class QuestionAnswerer(Node): +class QuestionAnswerer(BaseDialogueSystemNode): def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() param_values = declare_and_get_parameters( self, [ - (IN_EMOTION_TOPIC,), + (INPUT_TOPIC,), (OUT_QA_TOPIC,), (FEW_SHOT_PROMPT,), + (PARAM_TIMEOUT, 600), ], ) - self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] + self._input_topic = param_values[INPUT_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] self.prompt_file = param_values[FEW_SHOT_PROMPT] + self.timeout = param_values[PARAM_TIMEOUT] self.question_queue = queue.Queue() self.handler_thread = threading.Thread(target=self.process_question_queue) @@ -58,8 +62,8 @@ def __init__(self): # Handle subscription/publication topics. self.subscription = self.create_subscription( - InterpretedAudioUserEmotion, - self._in_emotion_topic, + DialogueUtterance, + self._input_topic, self.question_answer_callback, 1, ) @@ -67,7 +71,7 @@ def __init__(self): SystemTextResponse, self._out_qa_topic, 1 ) - def get_response(self, user_utterance: str, user_emotion: str): + def get_response(self, sub_msg: DialogueUtterance): """ Generate a response to the utterance, enriched with the addition of the user's detected emotion. Inference calls can be added and revised @@ -77,14 +81,14 @@ def get_response(self, user_utterance: str, user_emotion: str): try: if self.is_openai_ready: return_msg = colored( - self.prompt_gpt(user_utterance) + "\n", "light_green" + self.prompt_gpt(sub_msg.utterance_text) + "\n", "light_green" ) except RuntimeError as err: self.log.info(err) colored_apology = colored( "I'm sorry. I don't know how to answer your statement.", "light_red" ) - colored_emotion = colored(user_emotion, "light_red") + colored_emotion = colored(sub_msg.emotion, "light_red") return_msg = ( f"{colored_apology} I understand that you feel {colored_emotion}." 
) @@ -106,23 +110,22 @@ def process_question_queue(self): """ while True: msg = self.question_queue.get() - emotion = msg.user_emotion - response = self.get_response(msg.utterance_text, emotion) - self.publish_generated_response(msg.utterance_text, response) - - def publish_generated_response(self, utterance: str, response: str): - msg = SystemTextResponse() - msg.header.frame_id = "GPT Question Answering" - msg.header.stamp = self.get_clock().now().to_msg() - msg.utterance_text = utterance - msg.response = response - colored_utterance = colored(utterance, "light_blue") + response = self.get_response(msg) + self.publish_generated_response(msg, response) + + def publish_generated_response(self, sub_msg: DialogueUtterance, response: str): + pub_msg = SystemTextResponse() + pub_msg.header.frame_id = "GPT Question Answering" + pub_msg.header.stamp = self.get_clock().now().to_msg() + pub_msg.utterance_text = sub_msg.utterance_text + pub_msg.response = response + colored_utterance = colored(sub_msg.utterance_text, "light_blue") colored_response = colored(response, "light_green") self.log.info( f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n' + f'>>> "{colored_response}"' ) - self._qa_publisher.publish(msg) + self._qa_publisher.publish(pub_msg) def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"): prompt = self.prompt.format(question) @@ -137,6 +140,7 @@ def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"): "https://api.openai.com/v1/chat/completions", json=payload, headers={"Authorization": "Bearer {}".format(self.openai_api_key)}, + timeout=self.timeout, ) return ( json.loads(req.text)["choices"][0]["message"]["content"] diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py new file mode 100644 index 000000000..57c9243de --- /dev/null +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -0,0 +1,621 @@ +from enum import Enum +import itertools +import langchain +from langchain.chains import LLMChain +import json +from langchain.chat_models import ChatOpenAI +import openai +from operator import itemgetter +import os +import queue +import rclpy +from termcolor import colored +import threading +from typing import * + +from angel_msgs.msg import ( + ActivityDetection, + DialogueUtterance, + ObjectDetection2dSet, + SystemTextResponse, + TaskUpdate, +) +from angel_utils import declare_and_get_parameters +from angel_system.data.common import bounding_boxes +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode +from angel_system_nodes.base_intent_detector import INTENT_LABELS +from angel_system.utils.object_detection_queues import centroid_2d_strategy_queue + +openai.organization = os.getenv("OPENAI_ORG_ID") +openai.api_key = os.getenv("OPENAI_API_KEY") + +# Below is/are the subscribed topic(s). +IN_UTTERANCE_TOPIC = "utterance_topic" +IN_OBJECT_DETECTION_TOPIC = "object_detections_topic" +IN_ACT_CLFN_TOPIC = "action_classifications_topic" +IN_TASK_STATE_TOPIC = "task_state_topic" + +# Below is/are the published topic(s). +OUT_QA_TOPIC = "system_text_response_topic" + +# Below is used to filter out incoming questions. Toggle this parameter to True if questions +# are only responded to if they contain the TARGET_PHRASE. +PARAM_MUST_CONTAIN_TARGET_PHRASE = "must_contain_target_phrase" +TARGET_PHRASE = "angel" + +# Below indicates how many of the last n detected objects should be surfaced +# in the LLM prompt. 
These objects do NOT have to be unique. +PARAM_OBJECT_LAST_N_OBJ_DETECTIONS = "obj_det_last_n" + +# Comma-delimited list of objects to ignore. +PARAM_OBJECT_DETECTION_IGNORABLES = "object_det_ignored_objects" + +# Below are the corresponding model thresholds. +PARAM_OBJECT_DETECTION_THRESHOLD = "object_det_threshold" +PARAM_ACT_CLFN_THRESHOLD = "action_classification_threshold" + +# Below is the recipe paths for the intended task. +PARAM_RECIPE_PATH = "recipe_path" +# Below is the recipe paths for the prompt template. +PARAM_PROMPT_TEMPLATE_PATH = "prompt_template_path" +# Below is how many dialogue turns to keep maintained in the prompt context. +PARAM_CONTEXT_HISTORY_LENGTH = "context_history_length" + +# Below configures the width and height of an image. A typical example would be 1280 * 720. +PARAM_IMAGE_WIDTH = "pv_width" +PARAM_IMAGE_HEIGHT = "pv_height" +PARAM_DEBUG_MODE = "debug_mode" + +# Below are all the variables. These should correspond to the variables defined in the +# PROMPT_TEMPLATE_PATH and will be indicated by surrounding '{' and '}'. +PROMPT_VARIABLES = [ + "recipe", + "chat_history", + "optional_fields", + "centered_observables", + "all_observables", + "question", +] + +# Below configures the GPT request timeout in seconds. +PARAM_TIMEOUT = "timeout" + +class VisualQuestionAnswerer(BaseDialogueSystemNode): + class TimestampedEntity: + """ + This class is used internally as a container for recorded detections and classifications at + specific instances in time. + """ + + def __init__(self, time, entity): + self.time = time + self.entity = entity + + def __init__(self): + super().__init__() + self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (IN_UTTERANCE_TOPIC,), + (IN_TASK_STATE_TOPIC, ""), + (IN_OBJECT_DETECTION_TOPIC, ""), + (IN_ACT_CLFN_TOPIC, ""), + (PARAM_RECIPE_PATH,), + (PARAM_PROMPT_TEMPLATE_PATH,), + (PARAM_IMAGE_WIDTH,), + (PARAM_IMAGE_HEIGHT,), + (PARAM_OBJECT_DETECTION_IGNORABLES, ""), + (PARAM_OBJECT_LAST_N_OBJ_DETECTIONS, 5), + (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), + (PARAM_ACT_CLFN_THRESHOLD, 0.8), + (OUT_QA_TOPIC,), + (PARAM_CONTEXT_HISTORY_LENGTH, 3), + (PARAM_MUST_CONTAIN_TARGET_PHRASE, False), + (PARAM_TIMEOUT, 10), + (PARAM_DEBUG_MODE, False), + ], + ) + self._in_utterance_topic = param_values[IN_UTTERANCE_TOPIC] + self._in_task_state_topic = param_values[IN_TASK_STATE_TOPIC] + self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] + self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] + self._out_qa_topic = param_values[OUT_QA_TOPIC] + self.dialogue_history_length = param_values[PARAM_CONTEXT_HISTORY_LENGTH] + self.timeout = param_values[PARAM_TIMEOUT] + self.debug_mode = False + if param_values[PARAM_DEBUG_MODE]: + self.debug_mode = True + + self.param_must_contain_target_phrase = param_values[ + PARAM_MUST_CONTAIN_TARGET_PHRASE + ] + + # Used to obtain the center perspective point and how far detected objects + # are from it. + self.pv_width = param_values[PARAM_IMAGE_WIDTH] + self.pv_height = param_values[PARAM_IMAGE_HEIGHT] + pv_configured = self.pv_width > 0 and self.pv_height > 0 + self.pv_center_coordinate = ( + [self.pv_width / 2, self.pv_height / 2] if pv_configured else [None, None] + ) + + # Read the configured recipe file. 
+ self._recipe_path = param_values[PARAM_RECIPE_PATH] + self.recipe = self._configure_recipe(self._recipe_path) + self.log.info( + colored( + f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~", + "light_red" + ) + ) + + # Read the configured prompt template. + self._prompt_template_path = param_values[PARAM_PROMPT_TEMPLATE_PATH] + with open(self._prompt_template_path, "r") as file: + self.prompt_template = file.read() + self.log.info( + colored( + f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~", + "light_red" + ) + ) + + self.object_dtctn_ignorables = set( + [ + s.strip() + for s in param_values[PARAM_OBJECT_DETECTION_IGNORABLES].split(",") + ] + ) + self.log.info( + colored( + f"Will be ignoring the following objects: {self.object_dtctn_ignorables}", + "light_red", + ) + ) + self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] + self.object_dtctn_last_n_obj_detections = param_values[ + PARAM_OBJECT_LAST_N_OBJ_DETECTIONS + ] + + # Configure supplemental input action classification criteria. + self.action_clfn_threshold = param_values[PARAM_ACT_CLFN_THRESHOLD] + + # Configure supplemental input resources. + self.question_queue = queue.Queue() + self.current_step = None + self.completed_steps = None + self.action_classification_queue = queue.Queue() + self.detected_objects_queue = queue.Queue() + self.centroid_object_queue = centroid_2d_strategy_queue.Centroid2DStrategyQueue( + self.object_dtctn_last_n_obj_detections, + self.pv_center_coordinate[0], + self.pv_center_coordinate[1], + k=1, # the number of top-k objects to obtain from each detection. + ) + + self.dialogue_history = [] + self.handler_thread = threading.Thread(target=self.process_question_queue) + self.handler_thread.start() + + # Configure the (necessary) emotional detection enriched utterance subscription. + self.subscription = self.create_subscription( + DialogueUtterance, + self._in_utterance_topic, + self.question_answer_callback, + 1, + ) + # Configure the optional task updates subscription. + self.task_state_subscription = None + if self._in_task_state_topic: + self.task_state_subscription = self.create_subscription( + TaskUpdate, + self._in_task_state_topic, + self._set_current_step, + 1, + ) + + # Configure the optional object detection subscription. + self.objects_subscription = None + if self._in_objects_topic: + self.objects_subscription = self.create_subscription( + ObjectDetection2dSet, + self._in_objects_topic, + self._add_detected_objects, + 1, + ) + # Configure the optional action classification subscription. + self.action_subscription = None + if self.action_subscription: + self.action_subscription = self.create_subscription( + ActivityDetection, + self._in_actions_topic, + self._add_action_classification, + 1, + ) + # Configure the sole QA output of this node. + self._qa_publisher = self.create_publisher( + SystemTextResponse, self._out_qa_topic, 1 + ) + + # Configure OpenAI API. + self.openai_api_key = self._configure_openai_api_key() + self.openai_org_id = self._configure_openai_org_id() + + # Configure LangChain. + self.chain = self._configure_langchain() + + def _configure_openai_org_id(self): + if not os.getenv("OPENAI_ORG_ID"): + raise ValueError( + "OPENAI_ORG_ID environment variable is unset. " + + f"You should at least set it to garbage output." + ) + return os.getenv("OPENAI_ORG_ID") + + def _configure_openai_api_key(self): + if not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "OPENAI_API_KEY environment variable is unset. 
" + + f"You should at least set it to garbage output." + ) + return os.getenv("OPENAI_API_KEY") + + def _configure_recipe(self, recipe_path: str): + """ + Reads a recipe from a JSON file. The top-level keys in this file should correspond + to each of the steps for a determined task. The next level should contain an "index" + field to indicate the step number. + """ + f = open(recipe_path) + data = json.load(f) + steps = [None] * len(data.keys()) + for step in data.keys(): + idx = data[step]["index"] + steps[idx] = f"{idx + 1}. {step}" + return "\n".join(steps) + + def _configure_langchain(self): + """ + Handles OpenAI API prompting via LangChain. + """ + openai_llm = ChatOpenAI( + model_name="gpt-3.5-turbo", + openai_api_key=self.openai_api_key, + temperature=0.0, + max_tokens=64, + request_timeout=self.timeout, + ) + zero_shot_prompt = langchain.PromptTemplate( + input_variables=PROMPT_VARIABLES, + template=self.prompt_template, + ) + zero_shot_example = langchain.PromptTemplate.from_template("Tell me a joke") + + return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) + + def _get_sec(self, msg: DialogueUtterance) -> int: + return msg.header.stamp.sec + + def _set_current_step(self, msg: TaskUpdate): + self.current_step = msg.current_step_id + self.completed_steps = msg.completed_steps + + def _get_current_step(self): + return self.current_step + + def _get_completed_steps(self): + return self.completed_steps + + def _get_dialogue_history(self): + """ + Gets a string concatenation of the last self.dialogue_history_length turns of conversation. + """ + last_n = min(len(self.dialogue_history), self.dialogue_history_length) + last_n_turns = self.dialogue_history[-1 * last_n :] + return "\n".join(itertools.chain.from_iterable(last_n_turns)) + + def _add_action_classification(self, msg: ActivityDetection) -> str: + """ + Stores the action label with the highest confidence in + self.action_classification_queue. + """ + action_classification = max( + zip(msg.label_vec, msg.conf_vec), key=itemgetter(1) + )[0] + te = VisualQuestionAnswerer.TimestampedEntity( + self._get_sec(msg), action_classification + ) + self.action_classification_queue.put(te) + + def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: + """ + Stores all detected objects with a confidence score above IN_OBJECT_DETECTION_THRESHOLD. + """ + # We queue timestamped lists of pairs of (detections, confidence scores) for centered + # objects based on centroid distance from the middle. + self.centroid_object_queue.add( + self._get_sec(msg), + bounding_boxes.BoundingBoxes( + msg.left, + msg.right, + msg.top, + msg.bottom, + item=list(zip(msg.label_vec, msg.label_confidences)), + ), + ) + + # We queue ALL objects above threshold, regardless if they are centered in the user's + # perspective. + self._add_detected_objects_above_threshold(msg) + + def _add_detected_objects_above_threshold(self, msg): + """ + Queuse all objects above a configured threshold. 
+ """ + detected_objs = set() + for obj, score in zip(msg.label_vec, msg.label_confidences): + if score < self.object_dtctn_threshold: + continue + detected_objs.add(obj) + if detected_objs: + te = VisualQuestionAnswerer.TimestampedEntity( + self._get_sec(msg), detected_objs + ) + self.detected_objects_queue.put(te) + + def _add_dialogue_history(self, question: str, response: str, emotion: str): + self.dialogue_history.append((f"Me ({emotion}): {question}", f"You: {response}")) + + def _get_latest_action(self, curr_time: int) -> str: + """ + Returns the latest action classification in self.action_classification_queue + that does not occur before a provided time. + """ + latest_action = "" + while not self.action_classification_queue.empty(): + next = self.action_classification_queue.queue[0] + if next.time < curr_time: + latest_action = next.entity + self.action_classification_queue.get() + else: + break + return latest_action + + def _get_latest_centered_observables(self, curr_time: int) -> Set: + """ + Returns a comma-delimited list of "centered" objects per all + entities in self.detected_objects_queue that occurred before a provided time. + :param curr_time: The time for which objects must have been detected before. + :param n: The last n objects. + :return: returns a string-ified list of the latest observables + """ + observables = set() + # handle 2D centroid distance queueing. + timestamped_detections = self.centroid_object_queue.get_n_before( + timestamp=curr_time + ) + if timestamped_detections: + if self.debug_mode: + self.log.info("Timestamped detections based on centroid distance are: ") + for detection in timestamped_detections: + self.log.info( + f"- Timestamp = {detection[0]} Centroid-Dist-Object(s) = {detection[1]}") + # Recall that we passed in timestamped lists of pairs of + # (detection, confidence score). + centered_obj_detections_lists = [j for _, j in timestamped_detections] + for centered_obj_detections in centered_obj_detections_lists: + for centered_obj_detection in centered_obj_detections: + centroid, obj_score = centered_obj_detection + obj, score = obj_score + observables.add(obj) + observables = observables - self.object_dtctn_ignorables + self.log.info(f"CENTERED OBJECTS:" + str(observables)) + return observables + else: + return "nothing" + + def _get_latest_observables(self, curr_time: int, n: int) -> Set: + """ + Returns a comma-delimited list of all observed objects per all + entities in self.detected_objects_queue that occurred before a provided time. + Only refers to the latest n detections. + :param curr_time: The time for which objects must have been detected before. + :param n: The last n objects. 
+ :return: returns a string-ified list of the latest observables + """ + detections = [] + while not self.detected_objects_queue.empty(): + next_detections = self.detected_objects_queue.queue[0] + if next_detections.time < curr_time: + detections.append(self.detected_objects_queue.get()) + else: + break + observables = set() + for detection in detections[-n:]: + for obj in detection.entity: + observables.add(obj) + observables = observables - self.object_dtctn_ignorables + self.log.info(f"ALL OBJECTS:" + str(observables)) + return observables + + def get_response( + self, + msg: DialogueUtterance, + chat_history: str, + centered_observables: str, + all_observables: str, + optional_fields: str + ): + """ + Generate a response to the utterance, enriched with the addition of + the user's detected emotion, chat history, current step information, action, and + detected objects. Inference calls can be added and revised here. + """ + return_string = None + try: + self.log.info(f"User emotion: {msg.emotion}") + return_string = self.chain.run( + recipe=self.recipe, + optional_fields=optional_fields, + centered_observables=centered_observables, + all_observables=all_observables, + chat_history=chat_history, + question=f"Me ({msg.emotion}): {msg.utterance_text}", + ) + if self.debug_mode: + sent_prompt = self.chain.prompt.format_prompt( + recipe=self.recipe, + optional_fields=optional_fields, + centered_observables=centered_observables, + all_observables=all_observables, + chat_history=chat_history, + question=f"Me ({msg.emotion}): {msg.utterance_text}", + ).to_string() + sent_prompt = colored(sent_prompt, "light_blue") + self.log.info( + f"Prompt sent over:~~~~~~~~~~\n{sent_prompt}\n:~~~~~~~~~~" + ) + except RuntimeError as err: + self.log.info(err) + return_string = "I'm sorry. I don't know how to answer your question." + return return_string + + def question_answer_callback(self, msg: DialogueUtterance): + """ + This is the main ROS node listener callback loop that will process + all messages received via subscribed topics. + """ + self.log.info(f"Received message: \"{msg.utterance_text}\"") + if not self._apply_filter(msg): + return + + utt = msg.utterance_text + res = utt.split("Angel", 1) + if len(res)==1: + res = utt.split("angel", 1) + + splitString = res[1] + splitString = splitString.lstrip(',') + splitString = splitString.lstrip(' ') + + if len(splitString)>1: + msg.utterance_text = splitString.capitalize() + self.question_queue.put(msg) + + def _get_optional_fields_string(self, current_step: int, completed_steps: list) -> str: + optional_fields_string = "\n" + + if current_step==None: + #non started case + optional_fields_string += "I didn't start the recipe yet." + else: + if completed_steps[-1]==True: + #the last step is finished + optional_fields_string += f"I am done with all steps." + elif current_step==0: + #user is at step 1 + optional_fields_string += f"I am doing {current_step+1}" + optional_fields_string += f" and I am about to do {current_step+2}" + else: + optional_fields_string += f"I am doing {current_step+1}" + if current_step<=len(completed_steps)-2: + optional_fields_string += f" and I am about to do {current_step+2}" + + return optional_fields_string.rstrip("\n") + + def process_question_queue(self): + """ + Constant loop to process received questions. 
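For context, questions reach this loop only after question_answer_callback strips the wake word. A minimal standalone sketch of that normalization (the utterance below is invented for illustration; it mirrors, rather than replaces, the callback's split/strip logic):

```python
# Made-up utterance; mirrors the split/strip steps in question_answer_callback.
utt = "Hey Angel, what step am I on?"
parts = utt.split("Angel", 1)
if len(parts) == 1:            # fall back to the lowercase wake word
    parts = utt.split("angel", 1)
question = parts[1].lstrip(",").lstrip(" ")
print(question.capitalize())   # "What step am i on?" (str.capitalize lowercases the rest)
```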
+ """ + self.log.info("Spawning question-processing thread...") + while True: + question_msg = self.question_queue.get() + start_time = self._get_sec(question_msg) + self.log.info(f"Processing utterance \"{question_msg.utterance_text}\"") + + # Get the optional fields. + optional_fields = \ + self._get_optional_fields_string(self._get_current_step(),self._get_completed_steps()) + # Get centered detected objects. + centered_observables = self._get_latest_centered_observables(start_time) + # Get all detected objects. + all_observables = \ + self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) + + self.log.info(f"Current action detected: \"{self._get_latest_action(start_time)}\"") + response = None + is_object_clarification = \ + question_msg.intent and question_msg.intent == INTENT_LABELS[3] + if is_object_clarification and len(all_observables) > 1: + # Object Clarification override: If an associated intent exists and indicates + # object clarification in the presence of multiple objects, override the response with + # a clarification question. + self.log.info( + "Received confusing object clarification question from user " +\ + f"about multiple objects: ({all_observables}). " +\ + "Inquiring for more details...") + response = "I am seeing the following objects: " + for obs in all_observables: + response+= f"{str(obs)}, " + response.rsplit(",") + response +="What object are you referring to?" + self._add_dialogue_history("What is this?", response,"neutral") + elif is_object_clarification and len(all_observables) == 0: + output = "I don't see any objects. Could you look at it directly and ask again?" + self.log.info(output) + response = output + self._add_dialogue_history("What is this?", response, "neutral") + elif is_object_clarification and len(all_observables) == 1: + response = f"I think that is a {list(all_observables)[0]}?" + self._add_dialogue_history("What is this?", response, "neutral") + else: + # Normal response generation. + response = self.get_response( + question_msg, + self._get_dialogue_history(), + "", + ", ".join(all_observables) if len(all_observables) > 0 else "nothing", + optional_fields + ) + self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) + + self.publish_generated_response(question_msg.utterance_text, response) + + + def publish_generated_response(self, utterance: str, response: str): + msg = SystemTextResponse() + msg.header.frame_id = "GPT Question Answering" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = utterance + msg.response = response + colored_utterance = colored(utterance, "magenta") + colored_response = colored(response, "light_green") + self.log.info( + f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n' + + f'>>> "{colored_response}"' + ) + self._qa_publisher.publish(msg) + + def _apply_filter(self, msg): + """ + Abstracts away any filtering to apply on received messages. Return + a boolean value indicating if the message passes a filter and should be processed. 
+ """ + if self.param_must_contain_target_phrase: + return TARGET_PHRASE in msg.utterance_text.lower() or "angela" in msg.utterance_text.lower() + + else: + return True + + +def main(): + rclpy.init() + question_answerer = VisualQuestionAnswerer() + rclpy.spin(question_answerer) + question_answerer.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt new file mode 100644 index 000000000..b1df5ffdf --- /dev/null +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -0,0 +1,24 @@ +### +Pretend you are a professional chef teaching me how to best make the recipe below. You can tell me how to use the utensils or so certain actions, let me know what is the current status of the recipe, let me know what objects are in front of me and let me know about alternative paths of the recipe if utensils are not available. + +When you answer my question, follow the these rules: +* Use information from the recipe below and the available objects. +* Is should not deviate from the instructions, except there ingredients or utensils are not available. +* If I ask you something unrelated to cooking or the recipe, answer with: "Sorry, I can't help you with that". +* You should always resond in a conversational tone. +* DO NOT ANSWER "I'm sorry, I am an AI language model and I cannot see or perceive anything." +* You can differentiate between objects you see in the environment and perceive them as well. +* Don't use the phrasing "However,.." +* Don't say "Based on the information you provided," + +Here are the instructions of the recipe you are trying to teach me: +{recipe} +{optional_fields} + +Here are the objects that are directly accessible to me: {all_observables} {centered_observables}. Objects not mentioned in this list are not directly in my environment. Objects in the environment which are not mentioned in the recipe above, are not relevant. +### + +Here is our conversation history: +{chat_history} +{question} +Your answer (very short, precise, helpful with empathy): \ No newline at end of file diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json index 2310d05b5..5ad52699f 100644 --- a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json @@ -1,187 +1,133 @@ { - "Measure 12 ounces of cold water and transfer to a kettle. 
Boil the water.": { + "Measure 12 ounces of water in the liquid measuring cup.": { "level": 0, "index": 0, - "sub-steps": { - "Measure 12 ounces of water in the liquid measuring cup": { - "level": 1, - "index": 0, - "activity": "Measure 12 ounces of water in the liquid measuring cup" - }, - "Pour the water from the liquid measuring cup into the electric kettle": { - "level": 1, - "index": 1, - "activity": "Pour the water from the liquid measuring cup into the electric kettle" - }, - "Turn on the Kettle": { - "level": 1, - "index": 2, - "activity": "Turn on the Kettle" - } - } - }, - - "While the water is boiling, place the dripper on top of a coffee mug.": { + "sub-steps": {} + }, + + "Pour the water from the liquid measuring cup into the electric kettle.": { "level": 0, "index": 1, - "sub-steps": { - "Place the dripper on top of the mug": { - "level": 1, - "index": 0, - "activity": "Place the dripper on top of the mug" - } - } + "sub-steps": {} }, - "Prepare the filter insert by folding the paper filter in half to create a semi-circle, and in half again to create a quarter-circle. Place the paper filter in the dripper and spread open to create a cone.": { + "Turn on the Kettle.": { "level": 0, "index": 2, - "sub-steps": { - "Take the coffee filter and fold it in half to create a semi-circle": { - "level": 1, - "index": 0, - "activity": "Take the coffee filter and fold it in half to create a semi-circle" - }, - "Fold the filter in half again to create a quarter-circle": { - "level": 1, - "index": 1, - "activity": "Fold the filter in half again to create a quarter-circle" - }, - "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper": { - "level": 1, - "index": 2, - "activity": "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper" - }, - "Spread the filter open to create a cone inside the dripper": { - "level": 1, - "index": 3, - "activity": "Spread the filter open to create a cone inside the dripper" - } - } - }, - - "Weigh the coffee beans and grind until the coffee grounds are the consistency of coarse sand, about 20 seconds. 
Transfer the grounds to the filter cone.": { + "sub-steps": {} + }, + + "Place the dripper on top of the mug.": { "level": 0, "index": 3, - "sub-steps": { - "Turn on the kitchen scale": { - "level": 1, - "index": 0, - "activity": "Turn on the kitchen scale" - }, - "Place a bowl on the scale": { - "level": 1, - "index": 1, - "activity": "Place a bowl on the scale" - }, - "Zero the scale": { - "level": 1, - "index": 2, - "activity": "Zero the scale" - }, - "Add coffee beans to the bowl until the scale reads 25 grams": { - "level": 1, - "index": 3, - "activity": "Add coffee beans to the bowl until the scale reads 25 grams" - }, - "Pour the measured coffee beans into the coffee grinder": { - "level": 1, - "index": 4, - "activity": "Pour the measured coffee beans into the coffee grinder" - }, - "Set timer for 20 seconds": { - "level": 1, - "index": 5, - "activity": "Set timer for 20 seconds" - }, - "Turn on the timer": { - "level": 1, - "index": 6, - "activity": "Turn on the timer" - }, - "Grind the coffee beans by pressing and holding down on the black part of the lid": { - "level": 1, - "index": 7, - "activity": "Grind the coffee beans by pressing and holding down on the black part of the lid" - }, - "Pour the grounded coffee beans into the filter cone prepared in step 2": { - "level": 1, - "index": 8, - "activity": "Pour the grounded coffee beans into the filter cone prepared in step 2" - } - } - }, - - "Check the temperature.": { + "sub-steps": {} + }, + + "Take the coffee filter and fold it in half to create a semi-circle.": { "level": 0, "index": 4, - "sub-steps": { - "Turn on the thermometer": { - "level": 1, - "index": 0, - "activity": "Turn on the thermometer" - }, - "Place the end of the thermometer into the water": { - "level": 1, - "index": 1, - "activity": "Place the end of the thermometer into the water" - } - } - }, - - "Pour a small amount of water in the filter to wet the grounds. Wait about 30 seconds.": { + "sub-steps": {} + }, + + "Fold the filter in half again to create a quarter-circle.": { "level": 0, "index": 5, - "sub-steps": { - "Set timer to 30 seconds": { - "level": 1, - "index": 0, - "activity": "Set timer to 30 seconds" - }, - "Pour a small amount of water over the grounds in order to wet the grounds": { - "level": 1, - "index": 1, - "activity": "Pour a small amount of water over the grounds in order to wet the grounds" - } - } - }, - - "Slowly pour the rest of the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter.": { + "sub-steps": {} + }, + + "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { "level": 0, "index": 6, - "sub-steps": { - "Slowly pour the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter": { - "level": 1, - "index": 0, - "activity": "Slowly pour the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter" - } - } + "sub-steps": {} }, - "Let the coffee drain completely into the mug before removing the dripper. 
Discard the paper filter and coffee grounds.": { + "Spread the filter open to create a cone inside the dripper.": { "level": 0, "index": 7, - "sub-steps": { - "Allow the rest of the water in the dripper to drain": { - "level": 1, - "index": 0, - "activity": "Allow the rest of the water in the dripper to drain" - }, - "Remove the dripper from the cup": { - "level": 1, - "index": 1, - "activity": "Remove the dripper from the cup" - }, - "Remove the coffee grounds and paper filter from the dripper": { - "level": 1, - "index": 2, - "activity": "Remove the coffee grounds and paper filter from the dripper" - }, - "Discard the coffee grounds and paper filter": { - "level": 1, - "index": 3, - "activity": "Discard the coffee grounds and paper filter" - } - } + "sub-steps": {} + }, + + "Place a bowl on the scale.": { + "level": 0, + "index": 8, + "sub-steps": {} + }, + + "Turn on the kitchen scale and zero the scale.": { + "level": 0, + "index": 9, + "sub-steps": {} + }, + + "Add coffee beans to the bowl until the scale reads 25 grams.": { + "level": 0, + "index": 10, + "sub-steps": {} + }, + + "Pour the measured coffee beans into the coffee grinder.": { + "level": 0, + "index": 11, + "sub-steps": {} + }, + + "Grind the coffee beans by pressing and holding down on the black part of the lid.": { + "level": 0, + "index": 12, + "sub-steps": {} + }, + + "Transfer the grounds to the filter cone.": { + "level": 0, + "index": 13, + "sub-steps": {} + }, + + "Turn on the thermometer.": { + "level": 0, + "index": 14, + "sub-steps": {} + }, + + "Place the end of the thermometer into the water.": { + "level": 0, + "index": 15, + "sub-steps": {} + }, + + "Slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter.": { + "level": 0, + "index": 16, + "sub-steps": {} + }, + + "Continue slowly pour the water over the grounds in a circular motion. 
do not overfill beyond the top of the paper filter...": { + "level": 0, + "index": 17, + "sub-steps": {} + }, + + "Allow the rest of the water in the dripper to drain.": { + "level": 0, + "index": 18, + "sub-steps": {} + }, + + "Remove the dripper from the cup.": { + "level": 0, + "index": 19, + "sub-steps": {} + }, + + "Remove the coffee grounds and paper filter from the dripper.": { + "level": 0, + "index": 20, + "sub-steps": {} + }, + + "Discard the coffee grounds and paper filter.": { + "level": 0, + "index": 21, + "sub-steps": {} } } diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json new file mode 100644 index 000000000..c0bdedaed --- /dev/null +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json @@ -0,0 +1,67 @@ +{ + "Place tortilla on cutting board": { + "level": 0, + "index": 0, + "sub-steps": {} + }, + + "Use the butter knife to scoop nutella from the jar.": { + "level": 0, + "index": 1, + "sub-steps": {} + }, + + "Spread nutella onto tortilla, leaving ½ inch uncovered at the edges.": { + "level": 0, + "index": 2, + "sub-steps": {} + }, + + "Clean the knife by wiping with a paper towel": { + "level": 0, + "index": 3, + "sub-steps": {} + }, + + "Slice one banana.": { + "level": 0, + "index": 4, + "sub-steps": {} + }, + + "Top with banana slices.": { + "level": 0, + "index": 5, + "sub-steps": {} + }, + + "Clean the knife by wiping with a paper towel ": { + "level": 0, + "index": 6, + "sub-steps": {} + }, + + "Sprinkle small amount of cinnamon onto tortilla.": { + "level": 0, + "index": 7, + "sub-steps": {} + }, + + "Fold tortilla in half into semi-cirlce": { + "level": 0, + "index": 8, + "sub-steps": {} + }, + + "Slice tortilla in half using butter knife to create two triangular wedges.": { + "level": 0, + "index": 9, + "sub-steps": {} + }, + + "Place tortilla wedge on the plate.": { + "level": 0, + "index": 10, + "sub-steps": {} + } +} \ No newline at end of file diff --git a/ros/angel_system_nodes/setup.py b/ros/angel_system_nodes/setup.py index f8c4682f7..06ad5bcd3 100644 --- a/ros/angel_system_nodes/setup.py +++ b/ros/angel_system_nodes/setup.py @@ -25,7 +25,7 @@ "base_emotion_detector = angel_system_nodes.base_emotion_detector:main", "gpt_emotion_detector = angel_system_nodes.gpt_emotion_detector:main", "question_answerer = angel_system_nodes.question_answerer:main", - "intent_detector = angel_system_nodes.intent_detector:main", + "visual_question_answerer = angel_system_nodes.visual_question_answerer:main", "spatial_mapper = angel_system_nodes.spatial_mapper:main", "feedback_generator = angel_system_nodes.feedback_generator:main", "annotation_event_monitor = angel_system_nodes.annotation_event_monitor:main", diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml new file mode 100644 index 000000000..7d7f84282 --- /dev/null +++ b/tmux/Nov23-voice-live.yml @@ -0,0 +1,224 @@ +# +# System configuration to run the ANGEL system for the 2022/11 PI meeting and +# Evaluation 1. +# + +name: 2023-10-eval-live +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. 
+ +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} + export HL2_IP=${HL2_IP:-172.20.10.12} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs + export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BAGS_DIR=${ANGEL_WORKSPACE_DIR}/ros_bags + #export RMW_IMPLEMENTATION=rmw_cyclonedds_cpp + + # Changing the domain ID was important at KHQ to unblock perceived network + # congestion slowdowns to message sending. + export ROS_DOMAIN_ID=77 + + # Set the frame-rate to be used by multiple sources. This should be in frames + # per second (Hz). + export FRAME_RATE=15 + +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +windows: + - sensor_input: + layout: even-vertical + panes: + - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ROS_IP:=0.0.0.0 + - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ip_addr:=${HL2_IP} + -p image_topic:=PVFramesBGR + -p image_ts_topic:=PVFramesBGR_TS + -p hand_pose_topic:=disable + -p audio_topic:=HeadsetAudioData + -p sm_topic:=disable + -p head_pose_topic:=disable + -p pv_width:=1280 + -p pv_height:=720 + -p pv_framerate:=${FRAME_RATE} + -p sm_freq:=5 + -p rm_depth_AHAT:=disable + + # Visualize RGB Images being output from the headset + #- rqt_rgb_images: rqt -s rqt_image_view/ImageView + # --args ${ROS_NAMESPACE}/PVFramesBGR + # --ros-args -p _image_transport:=raw + + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://localhost:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=4 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://localhost:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - intent_detection: + layout: even-vertical + panes: + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 + - emotion_detection: + layout: even-vertical + panes: +# - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p user_emotion_topic:=base_emotion_topic +# - 
gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p user_emotion_topic:=gpt_emotion_topic +# -p timeout:=2 + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic + -p timeout:=2 + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=emotion_topic + -p task_state_topic:=TaskUpdates + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_quesadilla_recipe_steps.json + -p prompt_template_path:=${NODE_CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p pv_width:=1920 + -p pv_height:=1080 + -p obj_det_last_n:=8 + -p must_contain_target_phrase:=True + -p object_det_ignored_objects:="hand (left),hand (right),microwave (closed),microwave (open),background,trash can,peanut butter,nut butter jar lid,nut butter jar (open),nut butter jar (closed),jelly jar lid,jelly jar (open),jelly jar (closed),butter knife + nut butter,butter knife + jelly,tortilla + nut butter,tortilla + jelly" + -p debug_mode:=True + + - object_detector: + layout: even-vertical + panes: + - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p net_checkpoint:=${MODEL_DIR}/all_recipes+additional_objs+bkgd_yolov7.pt + -p inference_img_size:=1280 + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/yolo_all_recipes_additional_objs_bkgd_sample_rate_2.ckpt + -p model_mapping:=${MODEL_DIR}/yolo_all_recipes_additional_objs_bkgd_act_mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-all_activities-det_label_mapping.json + -p model_device:=cuda + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + + - keyboard_sys_cmd: ros2 run angel_system_nodes keyboard_to_sys_cmd --ros-args + -r __ns:=${ROS_NAMESPACE} + -p system_command_topic:=SystemCommands + + - task_monitor: ros2 run angel_system_nodes global_step_predictor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p det_topic:=ActivityDetections + -p model_file:=${MODEL_DIR}/global_step_predictor_act_avgs_all_classes_v2.0_sample_rate_2.npy + -p threshold_multiplier_weak:=0.05 + -p thresh_frame_count:=$((8 / (30 / ${FRAME_RATE}))) + -p threshold_frame_count_weak:=2 + -p deactivate_thresh_frame_count:=$((20 / (30 / ${FRAME_RATE}))) + -p step_mode:=granular + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=TaskUpdates + -p query_task_graph_topic:=query_task_graph + -p 
task_error_topic:=TaskErrors + -p system_command_topic:=SystemCommands + #-p gt_activity_mscoco:=model_files/test_activity_preds.mscoco.json + #-p gt_video_id:=8 + #-p gt_output_dir:="${BAGS_DIR}" + + - feedback_generator: ros2 run angel_system_nodes feedback_generator --ros-args + -r __ns:=${ROS_NAMESPACE} + -p activity_detector_topic:=ActivityDetections + -p object_detection_topic:=ObjectDetections3d + -p task_monitor_topic:=TaskUpdates + -p arui_update_topic:=AruiUpdates + -p utterances_topic:=utterances_topic + -p interp_user_intent_topic:=InterpUserIntents + -p system_text_response_topic:=system_text_response_topic + + - engineering-ui: + layout: even-vertical + panes: + - engineering_ui_websocket: ros2 launch rosbridge_server rosbridge_websocket_launch.xml port:=9090 + - engineering_ui_server: node src/angel_utils/multi_task_demo_ui/index.js + --namespace=${ROS_NAMESPACE} + --image_topic=pv_image_detections_2d/compressed + --query_task_graph_topic=query_task_graph + --task_updates_topic=TaskUpdates + --activity_detections_topic=ActivityDetections + --task_errors_topic=TaskErrors diff --git a/tmux/demos/2023-10-eval_prep-live.yml b/tmux/demos/2023-10-eval_prep-live.yml index fa65b4e5e..1c561f034 100644 --- a/tmux/demos/2023-10-eval_prep-live.yml +++ b/tmux/demos/2023-10-eval_prep-live.yml @@ -18,7 +18,7 @@ root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # on_project_start: command on_project_start: | export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} - export HL2_IP=${HL2_IP:-192.168.1.4} + export HL2_IP=${HL2_IP:-192.168.4.65} export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs diff --git a/tmux/eval_asr.yml b/tmux/eval_asr.yml deleted file mode 100644 index 5c439bfc3..000000000 --- a/tmux/eval_asr.yml +++ /dev/null @@ -1,69 +0,0 @@ -# -# Used to evaluate ASR and intent detection for a specified ROS bag of data. -# This configuration should be run by itself (e.g. not in combination with -# another tmuxinator launch). -# - -name: ASR Evaluation -root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> - -# Optional tmux socket -# socket_name: foo - -# Note that the pre and post options have been deprecated and will be replaced by -# project hooks. - -# Project hooks - -# Runs on project start, always -# on_project_start: command -on_project_start: | - export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} - export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs - -# Run on project start, the first time -# on_project_first_start: command - -# Run on project start, after the first time -# on_project_restart: command - -# Run on project exit ( detaching from tmux session ) -# on_project_exit: command - -# Run on project stop -# on_project_stop: command - -# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. -# pre_window: rbenv shell 2.0.0-p247 - -# Pass command line options to tmux. Useful for specifying a different tmux.conf. -# tmux_options: -f ~/.tmux.mac.conf -tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf - -# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. -# tmux_command: byobu - -# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. -# startup_window: editor - -# Specifies (by index) which pane of the specified window will be selected on project startup. 
If not set, the first pane is used. -# startup_pane: 1 - -# Controls whether the tmux session should be attached to automatically. Defaults to true. -# attach: false - -windows: - # - ros_bag_play: ros2 bag play <> - - ros_bag_play: ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 - - asr: ros2 run angel_system_nodes asr --ros-args - -r __ns:=${ROS_NAMESPACE} - -p audio_topic:=HeadsetAudioData - -p utterances_topic:=utterances_topic - -p asr_server_url:=http://communication.cs.columbia.edu:8058/asr - -p asr_req_segment_duration:=30 - - intent_detection: ros2 run angel_system_nodes intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml new file mode 100644 index 000000000..2fba7790c --- /dev/null +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -0,0 +1,199 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.0.23} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. 
+# attach: false + +windows: + - sensor_input: + layout: even-vertical + panes: + - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ROS_IP:=0.0.0.0 + - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ip_addr:=${HL2_IP} + -p image_topic:=PVFramesRGB + -p image_ts_topic:=PVFramesRGB_TS + -p hand_pose_topic:=disable + -p audio_topic:=HeadsetAudioData + -p sm_topic:=disable + -p head_pose_topic:=disable + -p pv_width:=1280 + -p pv_height:=720 + -p pv_framerate:=30 + -p sm_freq:=5 + -p rm_depth_AHAT:=disable + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesRGB + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesRGB + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesRGB + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesRGB_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p 
debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - intent_detection: + layout: even-vertical + panes: + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 + - emotion_detection: + layout: even-vertical + panes: + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic + -p timeout:=2 + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=emotion_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p pv_width:=1920 + -p pv_height:=1080 + -p obj_det_last_n:=8 + -p object_det_ignored_objects:="hand (left),hand (right),background" + -p must_contain_target_phrase:=False + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_barebones_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_barebones_question_answering.yml new file mode 100644 index 000000000..d9065b93a --- /dev/null +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_barebones_question_answering.yml @@ -0,0 +1,196 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. 
+# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p 
window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True +# - intent_detection: +# layout: even-vertical +# panes: +# - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p expect_user_intent_topic:=expect_user_intent_topic +# -p interp_user_intent_topic:=interp_user_intent_topic +# - emotion_detection: +# layout: even-vertical +# panes: +# - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=interp_user_intent_topic +# -p user_emotion_topic:=emotion_topic + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=utterances_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_last_n:=5 + -p pv_width:=1920 + -p pv_height:=1080 + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml new file mode 100644 index 000000000..225ba1387 --- /dev/null +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml @@ -0,0 +1,190 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. 
+ +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p 
det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - emotion_detection: + layout: even-vertical + panes: + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=emotion_topic + -p timeout:=2 + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=emotion_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_last_n:=5 + -p pv_width:=1920 + -p pv_height:=1080 + -p object_det_ignored_objects:="hand (left),hand (right),background" + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml new file mode 100644 index 000000000..97982a035 --- /dev/null +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml @@ -0,0 +1,200 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). 
+# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. 
+# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p 
vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - intent_detection: + layout: even-vertical + panes: + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 + - emotion_detection: + layout: even-vertical + panes: + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic + -p timeout:=2 + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=emotion_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p pv_width:=1920 + -p pv_height:=1080 + -p obj_det_last_n:=8 + -p object_det_ignored_objects:="hand (left),hand (right),background" + -p must_contain_target_phrase:=False + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/eval_vocal.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocal.yml similarity index 100% rename from tmux/eval_vocal.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocal.yml diff --git a/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_emotion_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_emotion_detection.yml new file mode 100644 index 000000000..b88684fb8 --- /dev/null +++ b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_emotion_detection.yml @@ -0,0 +1,94 @@ +# +# Used to evaluate Emotion Detection with vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: VAD + ASR + Emotion Detection +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. 
+ +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export NODE_RESOURCES_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/resource +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: + # - ros_bag_play: ros2 bag play <> + - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=3 + -p vad_margin:=0.20 + -p max_accumulation_length:=10 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=1 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - emotion_detection: + layout: even-vertical + panes: + - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=base_emotion_topic + - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=gpt_emotion_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_intent_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_detection.yml similarity index 94% rename from tmux/eval_vocalized_intent_detection.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_detection.yml index 559557e8a..bf160dc8f 100644 --- a/tmux/eval_vocalized_intent_detection.yml +++ b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_detection.yml @@ -10,7 +10,7 @@ # ``` # -name: Intent Detection with VAD +name: VAD + ASR + Intent Detection root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -60,7 +60,7 @@ tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf windows: # - ros_bag_play: 
ros2 bag play <> - - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00_0.db3 + - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 - vocal: layout: even-vertical panes: @@ -86,11 +86,11 @@ windows: panes: - base_intent_detection: ros2 run angel_system_nodes base_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_emotional_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_emotion_detection.yml similarity index 81% rename from tmux/eval_vocalized_emotional_detection.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_emotion_detection.yml index f16d448a3..ae24dde32 100644 --- a/tmux/eval_vocalized_emotional_detection.yml +++ b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_emotion_detection.yml @@ -10,7 +10,7 @@ # ``` # -name: ASR Evaluation with VAD +name: VAD + ASR + Intent Detection + Emotion Detection root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -84,26 +84,15 @@ windows: - intent_detection: layout: even-vertical panes: - - base_intent_detection: ros2 run angel_system_nodes base_intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic - emotion_detection: layout: even-vertical panes: - - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - -p user_emotion_topic:=base_emotion_topic - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic + -p input_topic:=interp_user_intent_topic -p user_emotion_topic:=gpt_emotion_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_question_answering.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_question_answering.yml similarity index 77% rename from tmux/eval_vocalized_question_answering.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_question_answering.yml index 3fd3c8dbb..b61ef88f1 100644 --- a/tmux/eval_vocalized_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_question_answering.yml @@ -1,16 +1,4 @@ -# -# Used to evaluate Question Answering with vocal processing for a specified ROS bag of data -# This configuration should be run by itself (e.g. 
not in combination with -# another tmuxinator launch). -# -# NOTE: In order to query GPT, you will need to execute -# ``` -# export OPENAI_API_KEY="YOUR API KEY" -# export OPENAI_ORG_ID="YOUR ORG ID" -# ``` -# - -name: ASR Evaluation with VAD +name: Vocal Question Answering root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -81,27 +69,18 @@ windows: -p asr_req_segment_duration:=1 -p is_sentence_tokenize:=False -p debug_mode:=True - - intent_detection: - layout: even-vertical - panes: - - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - emotion_detection: layout: even-vertical panes: - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic + -p input_topic:=utterances_topic -p user_emotion_topic:=gpt_emotion_topic - question_answering: layout: even-vertical panes: - gpt_question_answering: ros2 run angel_system_nodes question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=gpt_emotion_topic -p system_text_response_topic:=system_text_response_topic -p few_shot_prompt_file:=${CONFIG_DIR}/llm_prompts/tourniquet_steps_prompt diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs new file mode 100644 index 000000000..b1e86db02 --- /dev/null +++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs @@ -0,0 +1,99 @@ +//Do not edit! This file was generated by Unity-ROS MessageGeneration. +using System; +using System.Linq; +using System.Collections.Generic; +using System.Text; +using Unity.Robotics.ROSTCPConnector.MessageGeneration; + +namespace RosMessageTypes.Angel +{ + [Serializable] + public class DialogueUtteranceMsg : Message + { + public const string k_RosMessageName = "angel_msgs/DialogueUtterance"; + public override string RosMessageName => k_RosMessageName; + + // + // Dialogue Utterance with additional information about the environmental state + // and user model. + // + // The header primarily encapsulates when this message was emitted. + // The time component of this may be utilized as an identifier for this user + // intent and utterance. + public Std.HeaderMsg header; + // Speech-to-text of the user utterance we have interpreted + public string utterance_text; + // Below are optional fields + // Canonical user intent that has been interpreted. "Canonical" in this context + // is to mean that this string may be used as an identifier of this type of + // user intent. Should be in the range [0,1] where 1.0 means absolute confidence. + public string intent; + public double intent_confidence_score; + // Emotion classification. Should be in the range [0,1] where 1.0 means absolute confidence. 
+ public string emotion; + public double emotion_confidence_score; + + public DialogueUtteranceMsg() + { + this.header = new Std.HeaderMsg(); + this.utterance_text = ""; + this.intent = ""; + this.intent_confidence_score = 0.0; + this.emotion = ""; + this.emotion_confidence_score = 0.0; + } + + public DialogueUtteranceMsg(Std.HeaderMsg header, string utterance_text, string intent, double intent_confidence_score, string emotion, double emotion_confidence_score) + { + this.header = header; + this.utterance_text = utterance_text; + this.intent = intent; + this.intent_confidence_score = intent_confidence_score; + this.emotion = emotion; + this.emotion_confidence_score = emotion_confidence_score; + } + + public static DialogueUtteranceMsg Deserialize(MessageDeserializer deserializer) => new DialogueUtteranceMsg(deserializer); + + private DialogueUtteranceMsg(MessageDeserializer deserializer) + { + this.header = Std.HeaderMsg.Deserialize(deserializer); + deserializer.Read(out this.utterance_text); + deserializer.Read(out this.intent); + deserializer.Read(out this.intent_confidence_score); + deserializer.Read(out this.emotion); + deserializer.Read(out this.emotion_confidence_score); + } + + public override void SerializeTo(MessageSerializer serializer) + { + serializer.Write(this.header); + serializer.Write(this.utterance_text); + serializer.Write(this.intent); + serializer.Write(this.intent_confidence_score); + serializer.Write(this.emotion); + serializer.Write(this.emotion_confidence_score); + } + + public override string ToString() + { + return "DialogueUtteranceMsg: " + + "\nheader: " + header.ToString() + + "\nutterance_text: " + utterance_text.ToString() + + "\nintent: " + intent.ToString() + + "\nintent_confidence_score: " + intent_confidence_score.ToString() + + "\nemotion: " + emotion.ToString() + + "\nemotion_confidence_score: " + emotion_confidence_score.ToString(); + } + +#if UNITY_EDITOR + [UnityEditor.InitializeOnLoadMethod] +#else + [UnityEngine.RuntimeInitializeOnLoadMethod] +#endif + public static void Register() + { + MessageRegistry.Register(k_RosMessageName, Deserialize); + } + } +} diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta new file mode 100644 index 000000000..cfee2a66a --- /dev/null +++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 244f6af8d6d7e4c18a6e2d52b444d387 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant:
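
For reference alongside the generated Unity binding above, here is a minimal sketch (not part of the diff) of how a ROS 2 Python node could publish the new `angel_msgs/DialogueUtterance` message. It assumes the Python binding is generated as `angel_msgs.msg.DialogueUtterance` with the same field names as the C# class, and it reuses the `utterances_topic` topic name from the tmux configurations above; treat the node and topic wiring as illustrative rather than the system's actual publisher.

```python
# Hypothetical sketch: publish a DialogueUtterance message.
# Assumes the angel_msgs Python bindings are built in the workspace.
import rclpy
from rclpy.node import Node

from angel_msgs.msg import DialogueUtterance  # assumed Python binding


class DialogueUtterancePublisher(Node):
    def __init__(self):
        super().__init__("dialogue_utterance_publisher")
        # Topic name mirrors the `utterances_topic` parameter used in the
        # tmuxinator configs; adjust to match the running pipeline.
        self._pub = self.create_publisher(DialogueUtterance, "utterances_topic", 10)

    def publish_example(self):
        msg = DialogueUtterance()
        msg.header.stamp = self.get_clock().now().to_msg()
        msg.utterance_text = "What step am I on?"
        # Optional fields; confidence scores are expected to lie in [0, 1].
        msg.intent = "inquiry"
        msg.intent_confidence_score = 0.9
        msg.emotion = "neutral"
        msg.emotion_confidence_score = 0.8
        self._pub.publish(msg)


def main():
    rclpy.init()
    node = DialogueUtterancePublisher()
    node.publish_example()
    node.destroy_node()
    rclpy.shutdown()


if __name__ == "__main__":
    main()
```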