case study

import marimo

__generated_with = "0.18.4"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Case Study: Structured Crisis Tweet Analysis

    This notebook demonstrates how to use `sieves` to build a structured information extraction pipeline. We'll use a **toy example** involving crisis-related tweets to show how `sieves` can help sift through unstructured text to identify relevant events and extract key entities.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    The goal is to build a multi-stage filter (a sieve, if you will) that:
    1. **Classifies** if a tweet is relevant.
    2. **Extracts** the crisis type and location **only if** it's relevant.

    This conditional orchestration allows for efficient processing and reduced noise.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Downloading the Data


    We'll first download a [dataset](https://crisisnlp.qcri.org/data/lrec2016/labeled_cf/CrisisNLP_labeled_data_crowdflower_v2.zip) from https://crisisnlp.qcri.org/.
    """)
    return


@app.cell
def _():
    # Download the CrisisNLP archive (once), unpack it into the working
    # directory and move its contents into the local "CrisisNLP" folder.
    import urllib.request
    import zipfile
    from pathlib import Path
    import shutil

    url = "https://crisisnlp.qcri.org/data/lrec2016/labeled_cf/CrisisNLP_labeled_data_crowdflower_v2.zip"
    zip_path = Path("CrisisNLP_dataset.zip")
    data_dir = Path("CrisisNLP")

    # Skip the download when the archive is already on disk.
    if not zip_path.exists():
        urllib.request.urlretrieve(url, str(zip_path))

    with zipfile.ZipFile(str(zip_path), 'r') as z:
        z.extractall(".")

    # Move every extracted entry that is not yet present into data_dir, then
    # remove the extraction folder (including anything that was not moved).
    extracted = Path("CrisisNLP_labeled_data_crowdflower")
    data_dir.mkdir(exist_ok=True)
    for item in extracted.iterdir():
        target = data_dir / item.name
        if not target.exists():
            item.rename(target)
    shutil.rmtree(extracted)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Next we'll load the data into memory and preprocess it.
    """)
    return


@app.cell
def _():
    # Read every .tsv file found one level below "CrisisNLP" into a single
    # DataFrame, tagging each row with the dataset name and crisis type that
    # are derived from the sub-directory name ("_"-separated parts 1 and 2).
    import pandas as pd
    import os

    # Third "_"-separated token of the directory name -> crisis type label.
    # Tokens that are not listed here are used as the crisis type unchanged.
    _crisis_type_by_token = {
        'eq': 'Earthquake',
        'Odile': 'Hurricane',
        'Pam': 'Hurricane',
        'Typhoon': 'Hurricane',
        'ebola': 'Diseases',
        'cf': 'Diseases',
        'East': 'Diseases',
        'floods': 'Floods',
    }

    # Collect the per-file frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    _frames = []

    # Sorted listings keep the row order (and therefore any seeded sampling
    # done on the result) independent of the file system's listing order.
    for file in sorted(os.listdir("CrisisNLP")):
        if not os.path.isdir(os.path.join("CrisisNLP", file)):
            continue

        for file_ in sorted(os.listdir(os.path.join("CrisisNLP", file))):
            if not file_.endswith(".tsv"):
                continue

            data_ = pd.read_csv(os.path.join("CrisisNLP", file, file_), sep="\t")
            if len(data_) == 0:
                continue

            _name_parts = file.split("_")
            data_["dataset"] = _name_parts[1]
            crisis_type = _crisis_type_by_token.get(_name_parts[2], _name_parts[2])
            data_["crisis_type"] = crisis_type
            _frames.append(data_)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    data: pd.DataFrame = pd.concat(_frames) if _frames else pd.DataFrame()
    return data, os


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Data Exploration

    We'll load the CrisisNLP dataset, which contains tweets labeled by humans across different disaster types. This "gold" data allows us to evaluate our automated pipeline later.
    """)
    return


@app.cell
def _(data: "pd.DataFrame", mo):
    # Show the first 100 rows as an interactive table.
    mo.ui.table(data.head(100), label="CrisisNLP Dataset Preview")
    return


@app.cell
def _(data: "pd.DataFrame"):
    # Quick summary of the dataset
    summary = {
        "Total Tweets": len(data),
        "Crisis Types": data.crisis_type.unique().tolist(),
        "Labels": data.label.unique().tolist()
    }
    summary
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Now we'll convert the data into `sieves` documents, which will allow us to process them. We'll also sample down the data to cut down the time we need for processing the data.
    """)
    return


@app.cell
def _(data: "pd.DataFrame"):
    # Build one sieves Doc per sampled tweet, carrying the human labels as
    # gold data keyed by the ID of the task that will predict them.
    from sieves import Doc

    # We down-sample our dataset to avoid long processing times. The fixed
    # seed keeps the sample (and the results discussed below) reproducible;
    # min() avoids an error when fewer than 100 rows are available.
    data_sampled = data.sample(n=min(100, len(data)), random_state=42)

    docs = [
        Doc(
            uri=f"tweet_{row['tweet_id']}",
            text=row['tweet_text'],
            meta={
                'tweet_id': row['tweet_id'],
                'dataset': row['dataset'],
            },
            # We store ground truth in .gold per task ID, so we can evaluate our pipeline performance later.
            gold={
                "crisis_label_classifier": row['label'],
                "crisis_type_classifier": row['crisis_type']
            }
        )
        for idx, row in data_sampled.iterrows()
    ]

    print(f"Created {len(docs)} docs.")
    print("\nSample doc:")
    print(f"Tweet: {docs[0].text[:100]}...")
    print(f"Metadata: {docs[0].meta}")
    print(f"Ground truth: {docs[0].gold}")
    return Doc, data_sampled, docs


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Setting up the Pipeline

    A `sieves` pipeline is composed of tasks. We'll use:
    1. **Classification**: To filter for relevance and identify crisis types.
    2. **InformationExtraction**: To extract structured entities (Locations) using Pydantic models.

    ### Conditional Orchestration
    We only want to run the expensive extraction tasks on tweets that are actually related to a crisis. `sieves` allows us to define a `condition` function that acts as a gatekeeper.
    """)
    return


@app.cell
def _(os):
    import dspy

    # We use a lightweight model for this demonstration. The API key is read
    # from the OPENROUTER_API_KEY environment variable, which must be set.
    model = dspy.LM(
        "openrouter/google/gemini-2.5-flash-lite-preview-09-2025",
        api_base="https://openrouter.ai/api/v1",
        api_key=os.environ['OPENROUTER_API_KEY'],
    )

    # Batching improves throughput by processing multiple docs in a single prompt
    batch_size = 10
    return batch_size, model


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Let's create a task predicting tweet labels, one of which indicates whether the tweet is relevant to any crisis:
    """)
    return


@app.cell
def _(batch_size, data_sampled, model):
    from sieves import tasks

    # The candidate labels are the distinct gold labels present in the sample,
    # passed as a plain list of strings.
    crisis_label_classifier = tasks.Classification(
        task_id="crisis_label_classifier",
        labels=data_sampled.label.unique().tolist(),
        mode='single',
        model=model,
        batch_size=batch_size,
    )
    return crisis_label_classifier, tasks


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Defining the "Gatekeeper"
    The `related_to_crisis` function checks the output of the first classifier. If the tweet isn't relevant or the confidence is too low, subsequent tasks in the pipeline will be skipped for that document.
    """)
    return


@app.cell
def _(Doc, batch_size, data_sampled, model, tasks):
    def related_to_crisis(doc: Doc) -> bool:
        """Decide whether a document should be processed by the follow-up tasks.

        Returns False when the document has no (or a falsy)
        "crisis_label_classifier" result. Otherwise returns True only if the
        predicted label is not 'not_related_or_irrelevant' and the result's
        score is at least 0.6.
        """
        result = doc.results.get("crisis_label_classifier")
        if not result:
            return False
        return result.label != 'not_related_or_irrelevant' and result.score >= 0.6


    # Same setup as the label classifier, but gated by related_to_crisis and
    # predicting one of the crisis types present in the sample.
    crisis_type_classifier = tasks.Classification(
        task_id="crisis_type_classifier",
        labels=data_sampled.crisis_type.unique().tolist(),
        mode='single',
        model=model,
        condition=related_to_crisis,
        batch_size=batch_size,
    )
    return crisis_type_classifier, related_to_crisis


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Schema-Driven Information Extraction
    By using Pydantic models, we ensure that the LLM output is parsed into a structured object. This makes the data immediately useful for downstream applications.
    """)
    return


@app.cell
def _():
    import pydantic


    class Country(pydantic.BaseModel, frozen=True):
        """Entity schema for the extraction task: a single, optional country name."""

        name: str | None = pydantic.Field(
            description="The name of the country mentioned in the tweet, if any."
        )
    return (Country,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Having the entity types defined, we can proceed to define our tasks. We'll reuse the conditional check from before that allows us to skip irrelevant tweets.
    """)
    return


@app.cell
def _(Country, batch_size, model, related_to_crisis, tasks):
    # Extract a single Country entity per document, gated by related_to_crisis.
    location_extractor = tasks.InformationExtraction(
        task_id="location_extractor",
        entity_type=Country,
        model=model,
        mode='single',
        batch_size=batch_size,
        condition=related_to_crisis
    )
    return (location_extractor,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    Finally we can combine the three tasks into a pipeline:
    """)
    return


@app.cell
def _(crisis_label_classifier, crisis_type_classifier, location_extractor):
    # Tasks are chained with "+" in the order they should run: the label
    # classifier comes first because both later tasks are conditioned on it.
    pipeline = (
        crisis_label_classifier +
        crisis_type_classifier +
        location_extractor
    )
    return (pipeline,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Evaluating Results

    ### Running the Pipeline

    Everything is in place, so we run our pipeline and collect results:
    """)
    return


@app.cell
def _(docs, pipeline):
    results = list(pipeline(docs))
    return (results,)


@app.cell
def _(results):
    from typing import Any

    # Flatten results for display in a table. Each task result is looked up
    # once per document; a missing/falsy result is shown as a placeholder.
    display_results: list[dict[str, Any]] = []
    for doc in results:
        _label_result = doc.results.get('crisis_label_classifier')
        _type_result = doc.results.get('crisis_type_classifier')
        _location_result = doc.results.get('location_extractor')
        display_results.append({
            "Tweet": doc.text[:100] + "...",
            "Label": _label_result.label if _label_result else "N/A",
            "Type": _type_result.label if _type_result else "Skipped",
            "Location": (
                _location_result.entity.name
                if _location_result and _location_result.entity
                else "None/Skipped"
            ),
        })
    return (display_results,)


@app.cell
def _(display_results: "list[dict[str, Any]]", mo):
    mo.ui.table(display_results)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Evaluating Pipeline Reliability

    In a real-world scenario, we need to know if we can trust our automated extraction. By comparing our pipeline's "Predicted" results against the "Gold" human labels, we can calculate metrics like Accuracy and F1 Score.

    We do this by running the inbuilt evaluation functionality. Note that we don't have ground truth for the location extraction, so we'll ignore this task in our evaluation.
    """)
    return


@app.cell
def _(pipeline, results):
    from pprint import pprint

    # Evaluate the processed docs and print the summary of every task report.
    eval_report = pipeline.evaluate(results)
    for task_id in eval_report.reports:
        pprint(eval_report[task_id].summary())
    return (eval_report,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    That seems quite fine for `crisis_type`, and not so great for `crisis_label` - the mediocre performance for labels is probably because there are quite a few of them (15) and the model has difficulties telling them apart.

    Let's inspect the errors - hopefully we can learn from them to improve our pipeline (or maybe even find mistakes in the gold data!):
    """)
    return


@app.cell
def _(eval_report, mo):
    # Collect the failures reported for both classifiers into one table,
    # showing the gold value next to the prediction.
    errors: list[dict[str, str | float]] = []

    for tid in ("crisis_label_classifier", "crisis_type_classifier"):
        for failed_pred in eval_report[tid].failures:
            pred = failed_pred.results[tid]
            errors.append({
                "Task": tid,
                "Text": failed_pred.text,
                "Gold": failed_pred.gold[tid],
                # A falsy result has no label to show; keep the raw value.
                "Predicted": pred.label if pred else pred,
            })

    mo.ui.table(errors)
    return


@app.cell(hide_code=True)
def _():
    return


if __name__ == "__main__":
    app.run()