Skip to content

Commit 5dea218

Browse files
authored
Remove English-only filter. (#755)
1 parent 06bd60d commit 5dea218

File tree

1 file changed

+7
-16
lines changed

1 file changed

+7
-16
lines changed

promptsource/utils.py

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -107,13 +107,13 @@ def render_features(features):
107107
#
108108

109109

110-
def filter_english_datasets():
110+
def filter_datasets():
111111
"""
112-
Filter English datasets based on language tags in metadata.
112+
Filter datasets from HuggingFace API.
113113
114114
Also includes the datasets of any users listed in INCLUDED_USERS
115115
"""
116-
english_datasets = []
116+
filtered_datasets = []
117117

118118
response = requests.get("https://huggingface.co/api/datasets?full=true")
119119
tags = response.json()
@@ -125,25 +125,16 @@ def filter_english_datasets():
125125
if is_community_dataset:
126126
user = dataset_name.split("/")[0]
127127
if user in INCLUDED_USERS:
128-
english_datasets.append(dataset_name)
128+
filtered_datasets.append(dataset_name)
129129
continue
130130

131-
if "cardData" not in dataset:
132-
continue
133-
metadata = dataset["cardData"]
134-
135-
if "languages" not in metadata:
136-
continue
137-
languages = metadata["languages"]
138-
139-
if "en" in languages or "en-US" in languages:
140-
english_datasets.append(dataset_name)
131+
filtered_datasets.append(dataset_name)
141132

142-
return sorted(english_datasets)
133+
return sorted(filtered_datasets)
143134

144135

145136
def list_datasets():
146137
"""Get all the datasets to work with."""
147-
dataset_list = filter_english_datasets()
138+
dataset_list = filter_datasets()
148139
dataset_list.sort(key=lambda x: x.lower())
149140
return dataset_list

0 commit comments

Comments
 (0)