Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
install:
@echo "--- 🚀 Installing project dependencies ---"
uv pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
uv pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/timewarp -e ./browsergym/
uv run playwright install chromium

install-demo:
Expand Down
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ BrowserGym includes the following benchmarks by default:
- [AssistantBench](https://github.com/oriyor/assistantbench)
- [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
- [OpenApps](https://facebookresearch.github.io/OpenApps/)
- TimeWarp

Designing new web benchmarks with BrowserGym is easy: simply inherit from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.

Expand All @@ -62,6 +63,7 @@ pip install browsergym-visualwebarena # core + visualwebarena
pip install browsergym-workarena # core + workarena
pip install browsergym-assistantbench # core + assistantbench
pip install weblinx-browsergym # core + weblinx
pip install browsergym-timewarp # core + timewarp
```

Then setup playwright by running
Expand All @@ -77,6 +79,7 @@ Finally, each benchmark comes with its own specific setup that requires to follo
- for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
- for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
- for OpenApps, see [OpenApps docs](https://facebookresearch.github.io/OpenApps/)
- for TimeWarp, see [timewarp/README.md](browsergym/timewarp/README.md)

### 🏗️ Development setup

Expand Down Expand Up @@ -194,6 +197,20 @@ launcher = OpenAppsLauncher(config)
launcher.launch()
```

TimeWarp
```python
import gymnasium as gym
import browsergym.timewarp # register timewarp tasks as gym environments

# start a timewarp task
env = gym.make("browsergym/timewarp.1")
...

# list all the available timewarp tasks
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/timewarp")]
print("\n".join(env_ids))
```

## 💻 Demo

If you want to experiment with a demo agent in BrowserGym, follow these steps
Expand Down
14 changes: 14 additions & 0 deletions browsergym/core/src/browsergym/core/action/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,19 @@
goto, # GOTO, SEARCH
send_msg_to_user, # TERMINATE
],
"timewarp": [ #Starting from an action set identical to assistantbench
scroll, # SCROLL
fill, # TYPE
click, # CLICK
press, # PRESS ENTER
go_back, # GOBACK
goto, # GOTO, SEARCH
send_msg_to_user, # TERMINATE, sends message to user
report_infeasible, # explicit unachievable action, equivalent STOP "N/A"
new_tab, # Creates a new tab
tab_close, #Closes a tab
tab_focus, #Switches tabs
],
}


Expand Down Expand Up @@ -277,6 +290,7 @@ class HighLevelActionSet(AbstractActionSet):
"workarena++",
"weblinx",
"assistantbench",
"timewarp",
"custom",
]
DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def make_action_set(self):
"workarena",
"assistantbench",
"weblinx",
"timewarp"
]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@
retry_with_force=True,
demo_mode="off",
),
"timewarp": HighLevelActionSetArgs(
subsets=["timewarp"],
multiaction=False,
strict=False,
retry_with_force=True,
demo_mode="off",
),
}

# all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()`
Expand Down Expand Up @@ -291,4 +298,18 @@
),
task_metadata=task_metadata("weblinx"),
),
"timewarp": lambda n_repeats=1: Benchmark(
name="timewarp",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["timewarp"],
is_multi_tab=True,
supports_parallel_seeds=True,
backends=["timewarp"],
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("timewarp")),
max_steps=30,
n_repeats=n_repeats,
seeds_rng=np.random.RandomState(42),
),
task_metadata=task_metadata("timewarp"),
),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
task_name,sites,eval_types,task_id,browsergym_split
timewarp.1,wiki,llm_judge,1,test
timewarp.2,wiki,llm_judge,2,test
timewarp.3,wiki,llm_judge,3,test
timewarp.4,wiki,llm_judge,4,test
timewarp.5,wiki,llm_judge,5,test
timewarp.6,wiki,llm_judge,6,test
timewarp.7,wiki,llm_judge,7,test
timewarp.8,wiki,llm_judge,8,test
timewarp.9,wiki,llm_judge,9,test
timewarp.10,wiki,llm_judge,10,test
timewarp.11,wiki,llm_judge,11,test
timewarp.12,wiki,llm_judge,12,test
timewarp.13,wiki,llm_judge,13,test
timewarp.14,wiki,llm_judge,14,test
timewarp.15,wiki,llm_judge,15,test
timewarp.16,wiki,llm_judge,16,test
timewarp.17,wiki,llm_judge,17,test
timewarp.18,wiki,llm_judge,18,test
timewarp.19,wiki,llm_judge,19,test
timewarp.20,wiki,llm_judge,20,test
timewarp.21,wiki,llm_judge,21,test
timewarp.22,wiki,llm_judge,22,test
timewarp.23,wiki,llm_judge,23,test
timewarp.24,wiki,llm_judge,24,test
timewarp.25,wiki,llm_judge,25,test
timewarp.26,wiki,llm_judge,26,test
timewarp.27,wiki,llm_judge,27,test
timewarp.28,wiki,llm_judge,28,test
timewarp.29,wiki,llm_judge,29,test
timewarp.30,wiki,llm_judge,30,test
timewarp.31,wiki,llm_judge,31,test
timewarp.32,news,llm_judge,32,test
timewarp.33,news,llm_judge,33,test
timewarp.34,news,llm_judge,34,test
timewarp.35,news,llm_judge,35,test
timewarp.36,news,llm_judge,36,test
timewarp.37,news,llm_judge,37,test
timewarp.38,news,llm_judge,38,test
timewarp.39,news,llm_judge,39,test
timewarp.40,news,llm_judge,40,test
timewarp.41,news,llm_judge,41,test
timewarp.42,news,llm_judge,42,test
timewarp.43,news,llm_judge,43,test
timewarp.44,news,llm_judge,44,test
timewarp.45,news,llm_judge,45,test
timewarp.46,news,llm_judge,46,test
timewarp.47,news,llm_judge,47,test
timewarp.48,news,llm_judge,48,test
timewarp.49,news,llm_judge,49,test
timewarp.50,news,llm_judge,50,test
timewarp.51,news,llm_judge,51,test
timewarp.52,news,llm_judge,52,test
timewarp.53,news,llm_judge,53,test
timewarp.54,webshop,llm_judge,54,test
timewarp.55,webshop,llm_judge,55,test
timewarp.56,webshop,llm_judge,56,test
timewarp.57,webshop,llm_judge,57,test
timewarp.58,webshop,llm_judge,58,test
timewarp.59,webshop,llm_judge,59,test
timewarp.60,webshop,llm_judge,60,test
timewarp.61,webshop,llm_judge,61,test
timewarp.62,webshop,llm_judge,62,test
timewarp.63,webshop,llm_judge,63,test
timewarp.64,webshop,llm_judge,64,test
timewarp.65,webshop,llm_judge,65,test
timewarp.66,webshop,llm_judge,66,test
timewarp.67,webshop,llm_judge,67,test
timewarp.68,webshop,llm_judge,68,test
timewarp.69,webshop,llm_judge,69,test
timewarp.70,webshop,llm_judge,70,test
timewarp.71,webshop,llm_judge,71,test
timewarp.72,webshop,llm_judge,72,test
timewarp.73,webshop,llm_judge,73,test
timewarp.74,webshop,llm_judge,74,test
timewarp.75,webshop,llm_judge,75,test
timewarp.76,webshop,llm_judge,76,test
timewarp.77,webshop,llm_judge,77,test
timewarp.78,webshop,llm_judge,78,test
timewarp.79,webshop,llm_judge,79,test
timewarp.80,webshop,llm_judge,80,test
timewarp.81,"wiki,news",llm_judge,81,test
timewarp.82,"wiki,news",llm_judge,82,test
timewarp.83,"wiki,webshop",llm_judge,83,test
timewarp.84,"wiki,news,webshop",llm_judge,84,test
timewarp.85,"news,webshop",llm_judge,85,test
timewarp.86,"wiki,news",llm_judge,86,test
timewarp.87,"wiki,webshop",llm_judge,87,test
timewarp.88,"news,webshop",llm_judge,88,test
timewarp.89,"wiki,webshop",llm_judge,89,test
timewarp.90,"wiki,webshop",llm_judge,90,test
timewarp.91,"wiki,news",llm_judge,91,test
timewarp.92,"wiki,news",llm_judge,92,test
timewarp.93,"wiki,news",llm_judge,93,test
timewarp.94,"wiki,news,webshop",llm_judge,94,test
timewarp.95,"wiki,news,webshop",llm_judge,95,test
timewarp.96,"news,webshop",llm_judge,96,test
timewarp.97,"wiki,news,webshop",llm_judge,97,test
timewarp.98,"wiki,news",llm_judge,98,test
timewarp.99,"wiki,news,webshop",llm_judge,99,test
timewarp.100,"wiki,news,webshop",llm_judge,100,test
timewarp.101,"wiki,webshop",llm_judge,101,test
timewarp.102,"wiki,news",llm_judge,102,test
timewarp.103,"wiki,news",llm_judge,103,test
timewarp.104,wiki,llm_judge,104,train
timewarp.105,wiki,llm_judge,105,train
timewarp.106,wiki,llm_judge,106,train
timewarp.107,wiki,llm_judge,107,train
timewarp.108,wiki,llm_judge,108,train
timewarp.109,wiki,llm_judge,109,train
timewarp.110,wiki,llm_judge,110,train
timewarp.111,wiki,llm_judge,111,train
timewarp.112,wiki,llm_judge,112,train
timewarp.113,wiki,llm_judge,113,train
timewarp.114,wiki,llm_judge,114,train
timewarp.115,wiki,llm_judge,115,train
timewarp.116,wiki,llm_judge,116,train
timewarp.117,wiki,llm_judge,117,train
timewarp.118,wiki,llm_judge,118,train
timewarp.119,wiki,llm_judge,119,train
timewarp.120,wiki,llm_judge,120,train
timewarp.121,wiki,llm_judge,121,train
timewarp.122,wiki,llm_judge,122,train
timewarp.123,wiki,llm_judge,123,train
timewarp.124,wiki,llm_judge,124,train
timewarp.125,wiki,llm_judge,125,train
timewarp.126,wiki,llm_judge,126,train
timewarp.127,wiki,llm_judge,127,train
timewarp.128,wiki,llm_judge,128,train
timewarp.129,wiki,llm_judge,129,train
timewarp.130,wiki,llm_judge,130,train
timewarp.131,wiki,llm_judge,131,train
timewarp.132,wiki,llm_judge,132,train
timewarp.133,wiki,llm_judge,133,train
timewarp.134,wiki,llm_judge,134,train
timewarp.135,wiki,llm_judge,135,train
timewarp.136,wiki,llm_judge,136,train
timewarp.137,wiki,llm_judge,137,train
timewarp.138,wiki,llm_judge,138,train
timewarp.139,wiki,llm_judge,139,train
timewarp.140,wiki,llm_judge,140,train
timewarp.141,wiki,llm_judge,141,train
timewarp.142,wiki,llm_judge,142,train
timewarp.143,news,llm_judge,143,train
timewarp.144,news,llm_judge,144,train
timewarp.145,news,llm_judge,145,train
timewarp.146,news,llm_judge,146,train
timewarp.147,news,llm_judge,147,train
timewarp.148,news,llm_judge,148,train
timewarp.149,news,llm_judge,149,train
timewarp.150,news,llm_judge,150,train
timewarp.151,news,llm_judge,151,train
timewarp.152,news,llm_judge,152,train
timewarp.153,news,llm_judge,153,train
timewarp.154,news,llm_judge,154,train
timewarp.155,news,llm_judge,155,train
timewarp.156,news,llm_judge,156,train
timewarp.157,news,llm_judge,157,train
timewarp.158,news,llm_judge,158,train
timewarp.159,news,llm_judge,159,train
timewarp.160,news,llm_judge,160,train
timewarp.161,news,llm_judge,161,train
timewarp.162,news,llm_judge,162,train
timewarp.163,news,llm_judge,163,train
timewarp.164,news,llm_judge,164,train
timewarp.165,news,llm_judge,165,train
timewarp.166,news,llm_judge,166,train
timewarp.167,news,llm_judge,167,train
timewarp.168,webshop,llm_judge,168,train
timewarp.169,webshop,llm_judge,169,train
timewarp.170,webshop,llm_judge,170,train
timewarp.171,webshop,llm_judge,171,train
timewarp.172,webshop,llm_judge,172,train
timewarp.173,webshop,llm_judge,173,train
timewarp.174,webshop,llm_judge,174,train
timewarp.175,webshop,llm_judge,175,train
timewarp.176,webshop,llm_judge,176,train
timewarp.177,webshop,llm_judge,177,train
timewarp.178,webshop,llm_judge,178,train
timewarp.179,webshop,llm_judge,179,train
timewarp.180,webshop,llm_judge,180,train
timewarp.181,webshop,llm_judge,181,train
timewarp.182,webshop,llm_judge,182,train
timewarp.183,webshop,llm_judge,183,train
timewarp.184,webshop,llm_judge,184,train
timewarp.185,webshop,llm_judge,185,train
timewarp.186,webshop,llm_judge,186,train
timewarp.187,webshop,llm_judge,187,train
timewarp.188,webshop,llm_judge,188,train
timewarp.189,webshop,llm_judge,189,train
timewarp.190,webshop,llm_judge,190,train
timewarp.191,webshop,llm_judge,191,train
timewarp.192,webshop,llm_judge,192,train
timewarp.193,webshop,llm_judge,193,train
timewarp.194,webshop,llm_judge,194,train
timewarp.195,webshop,llm_judge,195,train
timewarp.196,webshop,llm_judge,196,train
timewarp.197,webshop,llm_judge,197,train
timewarp.198,webshop,llm_judge,198,train
timewarp.199,webshop,llm_judge,199,train
timewarp.200,webshop,llm_judge,200,train
timewarp.201,webshop,llm_judge,201,train
timewarp.202,webshop,llm_judge,202,train
timewarp.203,webshop,llm_judge,203,train
timewarp.204,webshop,llm_judge,204,train
timewarp.205,"wiki,news",llm_judge,205,train
timewarp.206,"wiki,news",llm_judge,206,train
timewarp.207,"wiki,webshop,news",llm_judge,207,train
timewarp.208,"news,webshop",llm_judge,208,train
timewarp.209,"wiki,news",llm_judge,209,train
timewarp.210,"wiki,webshop,news",llm_judge,210,train
timewarp.211,"news,webshop",llm_judge,211,train
timewarp.212,"wiki,webshop",llm_judge,212,train
timewarp.213,"wiki,webshop",llm_judge,213,train
timewarp.214,"wiki,news",llm_judge,214,train
timewarp.215,"wiki,news",llm_judge,215,train
timewarp.216,"wiki,news,webshop",llm_judge,216,train
timewarp.217,"wiki,news,webshop",llm_judge,217,train
timewarp.218,"news,webshop",llm_judge,218,train
timewarp.219,"wiki,news,webshop",llm_judge,219,train
timewarp.220,"wiki,news,webshop",llm_judge,220,train
timewarp.221,"wiki,news,webshop",llm_judge,221,train
timewarp.222,"wiki,webshop",llm_judge,222,train
timewarp.223,"wiki,news",llm_judge,223,train
timewarp.224,"wiki,news",llm_judge,224,train
timewarp.225,"wiki,news",llm_judge,225,train
timewarp.226,"news,webshop",llm_judge,226,train
timewarp.227,"news,webshop",llm_judge,227,train
timewarp.228,"wiki,news",llm_judge,228,train
timewarp.229,"news,webshop",llm_judge,229,train
timewarp.230,"wiki,news",llm_judge,230,train
timewarp.231,"wiki,news,webshop",llm_judge,231,train
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ def prepare_backend(backend: str):
all_tasks.extend(weblinx_browsergym.list_tasks(split=split, cache_dir=cache_dir))
demo_ids = weblinx_browsergym.get_unique_demo_ids(tasks=all_tasks)
weblinx_browsergym.download_and_unzip_demos(demo_ids=demo_ids, cache_dir=cache_dir)

case "timewarp":
import browsergym.timewarp
case _:
raise NotImplementedError(f"Unknown benchmark backend {repr(backend)}")

Expand Down
2 changes: 2 additions & 0 deletions browsergym/experiments/src/browsergym/experiments/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,8 @@ def _get_env_name(task_name: str):
import browsergym.assistantbench
elif task_name.startswith("weblinx"):
import weblinx_browsergym
elif task_name.startswith("timewarp"):
import browsergym.timewarp

return f"browsergym/{task_name}"

Expand Down
1 change: 1 addition & 0 deletions browsergym/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ dependencies = [
"weblinx-browsergym>=0.0.2",
"browsergym-webarenalite==0.14.3",
"browsergym-webarena-verified==0.14.3",
"browsergym-timewarp==0.14.3.dev0",
]

[tool.setuptools]
Expand Down
Loading