From 8aa7d8fe873f3e870426b7db8c3896927c29dd4d Mon Sep 17 00:00:00 2001 From: harjoth Date: Tue, 9 Jun 2026 17:56:51 -0700 Subject: [PATCH] gh-117807: Handle invalid UTF-8 in mimetypes comments --- Lib/mimetypes.py | 4 +-- Lib/test/test_mimetypes.py | 26 ++++++++++++++++++- ...-06-10-00-00-01.gh-issue-117807.Cx1178.rst | 2 ++ 3 files changed, 29 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-10-00-00-01.gh-issue-117807.Cx1178.rst diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py index 15e8c0a437bfd9..5381081bac2057 100644 --- a/Lib/mimetypes.py +++ b/Lib/mimetypes.py @@ -237,7 +237,7 @@ def read(self, filename, strict=True): list of standard types, else to the list of non-standard types. """ - with open(filename, encoding='utf-8') as fp: + with open(filename, encoding='utf-8', errors='replace') as fp: self.readfp(fp, strict) def readfp(self, fp, strict=True): @@ -428,7 +428,7 @@ def init(files=None): def read_mime_types(file): try: - f = open(file, encoding='utf-8') + f = open(file, encoding='utf-8', errors='replace') except OSError: return None with f: diff --git a/Lib/test/test_mimetypes.py b/Lib/test/test_mimetypes.py index 1a3b49b87b121f..1bdb5ecf05a2b3 100644 --- a/Lib/test/test_mimetypes.py +++ b/Lib/test/test_mimetypes.py @@ -67,9 +67,33 @@ def test_read_mime_types(self): with unittest.mock.patch.object(mimetypes, 'open', return_value=fp) as mock_open: mime_dict = mimetypes.read_mime_types(filename) - mock_open.assert_called_with(filename, encoding='utf-8') + mock_open.assert_called_with(filename, encoding='utf-8', + errors='replace') eq(mime_dict[".Français"], "application/no-mans-land") + def test_read_mime_types_invalid_utf8_comment(self): + with os_helper.temp_dir() as directory: + data = (b"# non-UTF-8 comment: \x83\n" + b"x-application/x-unittest pyunit\n") + file = os.path.join(directory, "sample.mimetype") + with open(file, "wb") as f: + f.write(data) + + mime_dict = mimetypes.read_mime_types(file) + self.assertEqual( + mime_dict[".pyunit"], "x-application/x-unittest") + + db = mimetypes.MimeTypes() + db.read(file) + self.assertEqual( + db.guess_file_type("sample.pyunit")[0], + "x-application/x-unittest") + + mimetypes.init(files=[file]) + self.assertEqual( + mimetypes.guess_file_type("sample.pyunit")[0], + "x-application/x-unittest") + def test_init_reinitializes(self): # Issue 4936: make sure an init starts clean # First, put some poison into the types table diff --git a/Misc/NEWS.d/next/Library/2026-06-10-00-00-01.gh-issue-117807.Cx1178.rst b/Misc/NEWS.d/next/Library/2026-06-10-00-00-01.gh-issue-117807.Cx1178.rst new file mode 100644 index 00000000000000..b3d2e1e895c7e8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-10-00-00-01.gh-issue-117807.Cx1178.rst @@ -0,0 +1,2 @@ +Fix :mod:`mimetypes` initialization from MIME map files containing invalid +UTF-8 bytes in comments.