Skip to content

UnicodeEncodeError: 'charmap' codec can't encode character '\u2265' in position 256 #53

@rkclimate20

Description

@rkclimate20

Steps to reproduce: run the test `test/test_extract_text.py::ExtractTextTest::test_extract_title_id_para_from_ipcc_syr`. It fails with the `UnicodeEncodeError` shown in the output report below.

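System: Windows. The traceback below shows the CSV row being encoded with cp1252 (the default Windows text encoding), which has no mapping for '\u2265' (≥); the output file is presumably opened without an explicit `encoding="utf-8"`.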

Output Report

 ExtractTextTest.test_extract_title_id_para_from_ipcc_syr ___________________________________________________

self = <test.test_extract_text.ExtractTextTest testMethod=test_extract_title_id_para_from_ipcc_syr>

    def test_extract_title_id_para_from_ipcc_syr(self):
        """
        read a chapter from IPCC and extract paras with ids and their section titles

        Purpose is to create CSV for input to LLM/RAG
        reads IPCC SYR report, finds all divs with paragraphs and returns the title and
        first para.
        """

        chapter = "longer-report"
        wg = "syr"
        infile = Path(Resources.TEST_RESOURCES_DIR, "ipcc", "cleaned_content",
                      wg, chapter, "html_with_ids.html")
        outdir = Path(Resources.TEMP_DIR, "csv", "ipcc")
        outfile_name = "syr_paras.csv"
        csvout = Path(outdir, outfile_name)

>       MiscLib.create_and_write_csv(
            infile,
            outdir,
            csvout,
            div_with_p_with_ids_xpath=".//body//div[p[@id]]",
            para_xpath=".//p",
            title_xpath=".//h2/text()|.//h3/text()|.//h4/text()")

test\test_extract_text.py:236:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test\test_extract_text.py:399: in create_and_write_csv
    cls._write_csv(csvout, divs_with_p_with_ids, para_xpath, title_xpath, wg, chap)
test\test_extract_text.py:375: in _write_csv
    cls._add_ids_and_text_as_csv_row(csvwriter, div_title, para, wg, chap)
test\test_extract_text.py:359: in _add_ids_and_text_as_csv_row
    csvwriter.writerow(row)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <encodings.cp1252.IncrementalEncoder object at 0x0000018C792CDC40>
input = 'None_None_3.1.1_p3,NO_TITLE,"Modelled pathways consistent with the continuation of policies implemented by the end of...te sensitivity or carbon cycle feedbacks are higher than the best estimate (high confidence). {WGIII SPM C.1.3}"\r\r\n'
final = False

    def encode(self, input, final=False):
>       return codecs.charmap_encode(input,self.errors,encoding_table)[0]
E       UnicodeEncodeError: 'charmap' codec can't encode character '\u2265' in position 256: character maps to <undefined>

..\..\..\anaconda3\envs\amienv\Lib\encodings\cp1252.py:19: UnicodeEncodeError
=================================================================== short test summary info ====================================================================
FAILED test/test_extract_text.py::ExtractTextTest::test_extract_title_id_para_from_ipcc_syr - UnicodeEncodeError: 'charmap' codec can't encode character '\u2265' in position 256: character maps to <undefined>
================================================================= 1 failed in 79.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions