Skip to content

Commit fa8e6e5

Browse files
Update eval hackathon with changes to main (#794)
* Accelerate `get_infos` by caching the `DataseInfoDict`s (#778) * accelerate `get_infos` by caching the `DataseInfoDict`s * quality * consistency * fix `filter_english_datasets` since `languages` became `language` in dataset metadatas * fix empty documents - multi_news (#793) * fix empty documents - multi_news * fix test - unrecognized variable * Language tags (#771) * Added languages widget to UI. * Style fixes. * Added English tag to existing datasets. * Add languages to viewer mode. * Update language codes. * Update CONTRIBUTING.md. * Update screenshot. * Add "Prompt" to UI to clarify languages tag usage. * Add blank languages list. Co-authored-by: Victor SANH <[email protected]>
1 parent 48ade7e commit fa8e6e5

File tree

424 files changed

+6606
-607
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

424 files changed

+6606
-607
lines changed

CONTRIBUTING.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ You can always update the name later. If you want to cancel the prompt, select
2929
1. **Write the prompt**. In the box labeled "Template," enter a Jinja expression.
3030
See the [getting started guide](#getting-started-using-jinja-to-write-prompts)
3131
and [cookbook](#jinja-cookbook) for details on how to write templates.
32-
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, and answer choices.
32+
1. **Fill in metadata**. Fill in the metadata for the current prompt: reference, original task, choices in templates, metrics, languages, and answer choices.
3333
See [Metadata](#metadata) for more details about these fields.
3434
1. **Save the prompt**. Hit the "Save" button. The output of the prompt
3535
applied to the current example will appear in the right sidebar.
@@ -124,6 +124,7 @@ to generate a question for a given answer would not.
124124
the options for the possible outputs (regardless of whether `answer_choices` is used).
125125
* **Metrics.** Use the multiselect widget to select all metrics commonly used to evaluate
126126
this task. Choose “Other” if there is one that is not included in the list.
127+
* **Languages.** Use the multiselect widget to select all languages used in the prompt. This is independent of what languages are used in the underlying dataset. For example, you could have an English prompt for a Spanish dataset.
127128
* **Answer Choices.** If the prompt has a small set of possible outputs (e.g., Yes/No,
128129
class labels, entailment judgements, etc.), then the prompt should define and use answer
129130
choices as follows. This allows evaluation to consider just the possible targets for

assets/promptsource_app.png

327 KB
Loading

promptsource/app.py

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME
1919
from promptsource.session import _get_state
20-
from promptsource.templates import INCLUDED_USERS, DatasetTemplates, Template, TemplateCollection
20+
from promptsource.templates import INCLUDED_USERS, LANGUAGES, METRICS, DatasetTemplates, Template, TemplateCollection
2121
from promptsource.utils import (
2222
get_dataset,
2323
get_dataset_confs,
@@ -57,6 +57,17 @@ def get_infos(all_infos, d_name):
5757
all_infos[d_name] = infos_dict
5858

5959

60+
def format_language(tag):
61+
"""
62+
Formats a language tag for display in the UI.
63+
64+
For example, if the tag is "en", then the function returns "en (English)"
65+
:param tag: language tag
66+
:return: formatted language name
67+
"""
68+
return tag + " (" + LANGUAGES[tag] + ")"
69+
70+
6071
# add an argument for read-only
6172
# At the moment, streamlit does not handle python script arguments gracefully.
6273
# Thus, for read-only mode, you have to type one of the below two:
@@ -421,6 +432,11 @@ def show_text(t, width=WIDTH, with_markdown=False):
421432
st.text(template.metadata.choices_in_prompt)
422433
st.markdown("##### Metrics")
423434
st.text(", ".join(template.metadata.metrics) if template.metadata.metrics else None)
435+
st.markdown("##### Prompt Languages")
436+
if template.metadata.languages:
437+
st.text(", ".join([format_language(tag) for tag in template.metadata.languages]))
438+
else:
439+
st.text(None)
424440
st.markdown("##### Answer Choices")
425441
if template.get_answer_choices_expr() is not None:
426442
show_jinja(template.get_answer_choices_expr())
@@ -559,35 +575,24 @@ def show_text(t, width=WIDTH, with_markdown=False):
559575
help="Prompt explicitly lists choices in the template for the output.",
560576
)
561577

562-
# Metrics from here:
563-
# https://github.com/google-research/text-to-text-transfer-transformer/blob/4b580f23968c2139be7fb1cd53b22c7a7f686cdf/t5/evaluation/metrics.py
564-
metrics_choices = [
565-
"BLEU",
566-
"ROUGE",
567-
"Squad",
568-
"Trivia QA",
569-
"Accuracy",
570-
"Pearson Correlation",
571-
"Spearman Correlation",
572-
"MultiRC",
573-
"AUC",
574-
"COQA F1",
575-
"Edit Distance",
576-
]
577-
# Add mean reciprocal rank
578-
metrics_choices.append("Mean Reciprocal Rank")
579-
# Add generic other
580-
metrics_choices.append("Other")
581-
# Sort alphabetically
582-
metrics_choices = sorted(metrics_choices)
583578
state.metadata.metrics = st.multiselect(
584579
"Metrics",
585-
metrics_choices,
580+
sorted(METRICS),
586581
default=template.metadata.metrics,
587582
help="Select all metrics that are commonly used (or should "
588583
"be used if a new task) to evaluate this prompt.",
589584
)
590585

586+
state.metadata.languages = st.multiselect(
587+
"Prompt Languages",
588+
sorted(LANGUAGES.keys()),
589+
default=template.metadata.languages,
590+
format_func=format_language,
591+
help="Select all languages used in this prompt. "
592+
"This annotation is independent from the language(s) "
593+
"of the dataset.",
594+
)
595+
591596
# Answer choices
592597
if template.get_answer_choices_expr() is not None:
593598
answer_choices = template.get_answer_choices_expr()

promptsource/templates.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,212 @@
2929

3030
INCLUDED_USERS = {"Zaid", "craffel", "GEM", "aps", "khalidalt", "shanya", "rbawden", "BigScienceBiasEval", "gsarti"}
3131

32+
# These are the metrics with which templates can be tagged
33+
METRICS = {
34+
"BLEU",
35+
"ROUGE",
36+
"Squad",
37+
"Trivia QA",
38+
"Accuracy",
39+
"Pearson Correlation",
40+
"Spearman Correlation",
41+
"MultiRC",
42+
"AUC",
43+
"COQA F1",
44+
"Edit Distance",
45+
"Mean Reciprocal Rank",
46+
"Other",
47+
}
48+
49+
# These are the languages with which templates can be tagged. Keys are ISO 639-1
50+
# tags, which are the actual tags we use. Values are English names shown in the
51+
# UI for convenience.
52+
LANGUAGES = {
53+
"ab": "Abkhazian",
54+
"aa": "Afar",
55+
"af": "Afrikaans",
56+
"ak": "Akan",
57+
"sq": "Albanian",
58+
"am": "Amharic",
59+
"ar": "Arabic",
60+
"an": "Aragonese",
61+
"hy": "Armenian",
62+
"as": "Assamese",
63+
"av": "Avaric",
64+
"ae": "Avestan",
65+
"ay": "Aymara",
66+
"az": "Azerbaijani",
67+
"bm": "Bambara",
68+
"ba": "Bashkir",
69+
"eu": "Basque",
70+
"be": "Belarusian",
71+
"bn": "Bengali",
72+
"bi": "Bislama",
73+
"bs": "Bosnian",
74+
"br": "Breton",
75+
"bg": "Bulgarian",
76+
"my": "Burmese",
77+
"ca": "Catalan, Valencian",
78+
"ch": "Chamorro",
79+
"ce": "Chechen",
80+
"ny": "Chichewa, Chewa, Nyanja",
81+
"zh": "Chinese",
82+
"cu": "Church Slavic, Old Slavonic, Church Slavonic, Old Bulgarian, Old Church Slavonic",
83+
"cv": "Chuvash",
84+
"kw": "Cornish",
85+
"co": "Corsican",
86+
"cr": "Cree",
87+
"hr": "Croatian",
88+
"cs": "Czech",
89+
"da": "Danish",
90+
"dv": "Divehi, Dhivehi, Maldivian",
91+
"nl": "Dutch, Flemish",
92+
"dz": "Dzongkha",
93+
"en": "English",
94+
"eo": "Esperanto",
95+
"et": "Estonian",
96+
"ee": "Ewe",
97+
"fo": "Faroese",
98+
"fj": "Fijian",
99+
"fi": "Finnish",
100+
"fr": "French",
101+
"fy": "Western Frisian",
102+
"ff": "Fulah",
103+
"gd": "Gaelic, Scottish Gaelic",
104+
"gl": "Galician",
105+
"lg": "Ganda",
106+
"ka": "Georgian",
107+
"de": "German",
108+
"el": "Greek, Modern (1453–)",
109+
"kl": "Kalaallisut, Greenlandic",
110+
"gn": "Guarani",
111+
"gu": "Gujarati",
112+
"ht": "Haitian, Haitian Creole",
113+
"ha": "Hausa",
114+
"he": "Hebrew",
115+
"hz": "Herero",
116+
"hi": "Hindi",
117+
"ho": "Hiri Motu",
118+
"hu": "Hungarian",
119+
"is": "Icelandic",
120+
"io": "Ido",
121+
"ig": "Igbo",
122+
"id": "Indonesian",
123+
"ia": "Interlingua (International Auxiliary Language Association)",
124+
"ie": "Interlingue, Occidental",
125+
"iu": "Inuktitut",
126+
"ik": "Inupiaq",
127+
"ga": "Irish",
128+
"it": "Italian",
129+
"ja": "Japanese",
130+
"jv": "Javanese",
131+
"kn": "Kannada",
132+
"kr": "Kanuri",
133+
"ks": "Kashmiri",
134+
"kk": "Kazakh",
135+
"km": "Central Khmer",
136+
"ki": "Kikuyu, Gikuyu",
137+
"rw": "Kinyarwanda",
138+
"ky": "Kirghiz, Kyrgyz",
139+
"kv": "Komi",
140+
"kg": "Kongo",
141+
"ko": "Korean",
142+
"kj": "Kuanyama, Kwanyama",
143+
"ku": "Kurdish",
144+
"lo": "Lao",
145+
"la": "Latin",
146+
"lv": "Latvian",
147+
"li": "Limburgan, Limburger, Limburgish",
148+
"ln": "Lingala",
149+
"lt": "Lithuanian",
150+
"lu": "Luba-Katanga",
151+
"lb": "Luxembourgish, Letzeburgesch",
152+
"mk": "Macedonian",
153+
"mg": "Malagasy",
154+
"ms": "Malay",
155+
"ml": "Malayalam",
156+
"mt": "Maltese",
157+
"gv": "Manx",
158+
"mi": "Maori",
159+
"mr": "Marathi",
160+
"mh": "Marshallese",
161+
"mn": "Mongolian",
162+
"na": "Nauru",
163+
"nv": "Navajo, Navaho",
164+
"nd": "North Ndebele",
165+
"nr": "South Ndebele",
166+
"ng": "Ndonga",
167+
"ne": "Nepali",
168+
"no": "Norwegian",
169+
"nb": "Norwegian Bokmål",
170+
"nn": "Norwegian Nynorsk",
171+
"ii": "Sichuan Yi, Nuosu",
172+
"oc": "Occitan",
173+
"oj": "Ojibwa",
174+
"or": "Oriya",
175+
"om": "Oromo",
176+
"os": "Ossetian, Ossetic",
177+
"pi": "Pali",
178+
"ps": "Pashto, Pushto",
179+
"fa": "Persian",
180+
"pl": "Polish",
181+
"pt": "Portuguese",
182+
"pa": "Punjabi, Panjabi",
183+
"qu": "Quechua",
184+
"ro": "Romanian, Moldavian, Moldovan",
185+
"rm": "Romansh",
186+
"rn": "Rundi",
187+
"ru": "Russian",
188+
"se": "Northern Sami",
189+
"sm": "Samoan",
190+
"sg": "Sango",
191+
"sa": "Sanskrit",
192+
"sc": "Sardinian",
193+
"sr": "Serbian",
194+
"sn": "Shona",
195+
"sd": "Sindhi",
196+
"si": "Sinhala, Sinhalese",
197+
"sk": "Slovak",
198+
"sl": "Slovenian",
199+
"so": "Somali",
200+
"st": "Southern Sotho",
201+
"es": "Spanish, Castilian",
202+
"su": "Sundanese",
203+
"sw": "Swahili",
204+
"ss": "Swati",
205+
"sv": "Swedish",
206+
"tl": "Tagalog",
207+
"ty": "Tahitian",
208+
"tg": "Tajik",
209+
"ta": "Tamil",
210+
"tt": "Tatar",
211+
"te": "Telugu",
212+
"th": "Thai",
213+
"bo": "Tibetan",
214+
"ti": "Tigrinya",
215+
"to": "Tonga (Tonga Islands)",
216+
"ts": "Tsonga",
217+
"tn": "Tswana",
218+
"tr": "Turkish",
219+
"tk": "Turkmen",
220+
"tw": "Twi",
221+
"ug": "Uighur, Uyghur",
222+
"uk": "Ukrainian",
223+
"ur": "Urdu",
224+
"uz": "Uzbek",
225+
"ve": "Venda",
226+
"vi": "Vietnamese",
227+
"vo": "Volapük",
228+
"wa": "Walloon",
229+
"cy": "Welsh",
230+
"wo": "Wolof",
231+
"xh": "Xhosa",
232+
"yi": "Yiddish",
233+
"yo": "Yoruba",
234+
"za": "Zhuang, Chuang",
235+
"zu": "Zulu",
236+
}
237+
32238

33239
def highlight(input):
34240
return "<span style='color: #F08080'>" + input + "</span>"
@@ -229,6 +435,7 @@ def __init__(
229435
original_task: Optional[bool] = None,
230436
choices_in_prompt: Optional[bool] = None,
231437
metrics: Optional[List[str]] = None,
438+
languages: Optional[List[str]] = None,
232439
):
233440
"""
234441
Initializes template metadata.
@@ -242,10 +449,12 @@ def __init__(
242449
:param choices_in_prompt: If True, the answer choices are included in the templates such that models
243450
see those choices in the input. Only applicable to classification tasks.
244451
:param metrics: List of strings denoting metrics to use for evaluation
452+
:param languages: List of strings denoting languages used in the prompt (not the associated dataset!)
245453
"""
246454
self.original_task = original_task
247455
self.choices_in_prompt = choices_in_prompt
248456
self.metrics = metrics
457+
self.languages = languages
249458

250459

251460
class TemplateCollection:
@@ -505,6 +714,7 @@ def get_templates_data_frame():
505714
"original_task": [],
506715
"choices_in_prompt": [],
507716
"metrics": [],
717+
"languages": [],
508718
"answer_choices": [],
509719
"jinja": [],
510720
}
@@ -523,6 +733,7 @@ def get_templates_data_frame():
523733
data["original_task"].append(template.metadata.original_task)
524734
data["choices_in_prompt"].append(template.metadata.choices_in_prompt)
525735
data["metrics"].append(template.metadata.metrics)
736+
data["languages"].append(template.metadata.languages)
526737
data["answer_choices"].append(template.get_answer_choices_expr())
527738
data["jinja"].append(template.jinja)
528739

0 commit comments

Comments
 (0)