diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index ffbaee91..9b2d8f44 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -52,8 +52,8 @@ def _is_code_column(name: str) -> bool: ) -def _read_wqp_csv(text: str) -> DataFrame: - """Read a WQP CSV, forcing code/identifier columns to ``str``. +def _read_wqp_csv(text: str, delimiter: str = ",") -> DataFrame: + """Read a WQP CSV/TSV, forcing code/identifier columns to ``str``. WQP returns codes with significant leading zeros — HUCs, parameter codes (``USGSpcode``), FIPS state/county codes. A bare ``read_csv`` infers those @@ -61,10 +61,20 @@ def _read_wqp_csv(text: str) -> DataFrame: ``"07090002"`` -> ``7090002``). Read the header first, then re-read with ``dtype=str`` for every column that :func:`_is_code_column` flags, so the zeros survive. + + ``delimiter`` selects comma (CSV, the default) vs tab (TSV); see + :func:`_wqp_delimiter`. """ - columns = pd.read_csv(StringIO(text), delimiter=",", nrows=0).columns + columns = pd.read_csv(StringIO(text), delimiter=delimiter, nrows=0).columns str_cols = {col: str for col in columns if _is_code_column(col)} - return pd.read_csv(StringIO(text), delimiter=",", low_memory=False, dtype=str_cols) + return pd.read_csv( + StringIO(text), delimiter=delimiter, low_memory=False, dtype=str_cols + ) + + +def _wqp_delimiter(kwargs: dict[str, Any]) -> str: + """Field delimiter for the requested ``mimeType``: tab for ``tsv``, else comma.""" + return "\t" if kwargs.get("mimeType") == "tsv" else "," def get_results( @@ -181,7 +191,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) - df = _read_wqp_csv(response.text) + df = _read_wqp_csv(response.text, _wqp_delimiter(kwargs)) df = _attach_datetime_columns(df) return df, WQP_Metadata(response, **kwargs) @@ -209,7 +219,7 @@ def _what( url = _legacy_only_url(service, legacy=legacy) response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = _read_wqp_csv(response.text) + df = _read_wqp_csv(response.text, _wqp_delimiter(kwargs)) return df, WQP_Metadata(response, **kwargs) @@ -690,9 +700,13 @@ def _check_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]: mimetype = kwargs.get("mimeType") if mimetype == "geojson": raise NotImplementedError("GeoJSON not yet supported. Set 'mimeType=csv'.") - elif mimetype != "csv" and mimetype is not None: - raise ValueError("Invalid mimeType. Set 'mimeType=csv'.") - else: + elif mimetype == "xlsx": + raise NotImplementedError( + "Excel format not yet supported. Set 'mimeType=csv' or 'mimeType=tsv'." + ) + elif mimetype not in ("csv", "tsv", None): + raise ValueError("Invalid mimeType. Supported options: 'csv', 'tsv'.") + elif mimetype is None: kwargs["mimeType"] = "csv" return kwargs diff --git a/tests/wqp_test.py b/tests/wqp_test.py index e4d0dba0..73b11923 100644 --- a/tests/wqp_test.py +++ b/tests/wqp_test.py @@ -53,6 +53,37 @@ def test_read_wqp_csv_preserves_leading_zero_codes(): assert df["ResultMeasureValue"].iloc[0] == 1.5 +def test_read_wqp_csv_tsv_delimiter_preserves_codes(): + """``mimeType=tsv`` responses are parsed as tab-delimited via + ``_read_wqp_csv``'s ``delimiter`` while still preserving leading zeros on + code columns.""" + from dataretrieval.wqp import _read_wqp_csv + + tsv = ( + "Location_HUCEightDigitCode\tUSGSpcode\tResultMeasureValue\n" + "07090002\t00060\t1.5\n" + ) + df = _read_wqp_csv(tsv, delimiter="\t") + assert list(df.columns) == [ + "Location_HUCEightDigitCode", + "USGSpcode", + "ResultMeasureValue", + ] + assert df["Location_HUCEightDigitCode"].iloc[0] == "07090002" + assert df["USGSpcode"].iloc[0] == "00060" + assert df["ResultMeasureValue"].iloc[0] == 1.5 + + +def test_wqp_delimiter_selects_tab_for_tsv(): + """``_wqp_delimiter`` maps ``mimeType=tsv`` to a tab and everything else + (including a missing mimeType) to a comma.""" + from dataretrieval.wqp import _wqp_delimiter + + assert _wqp_delimiter({"mimeType": "tsv"}) == "\t" + assert _wqp_delimiter({"mimeType": "csv"}) == "," + assert _wqp_delimiter({}) == "," + + def test_get_results(httpx_mock): """Tests water quality portal ratings query""" request_url = ( @@ -155,6 +186,16 @@ def test_check_kwargs(): kwargs = _check_kwargs(kwargs) +def test_check_kwargs_mimetype_csv_tsv_xlsx(): + """csv/tsv are accepted as-is, a missing mimeType defaults to csv, and + xlsx raises a clear NotImplementedError pointing at the csv/tsv options.""" + assert _check_kwargs({"mimeType": "csv"})["mimeType"] == "csv" + assert _check_kwargs({"mimeType": "tsv"})["mimeType"] == "tsv" + assert _check_kwargs({})["mimeType"] == "csv" + with pytest.raises(NotImplementedError, match="Excel"): + _check_kwargs({"mimeType": "xlsx"}) + + def test_get_results_wqx3_preserves_user_dataProfile(httpx_mock): """A valid user-supplied WQX3.0 profile must not be overwritten.