From 310e7f77b1ff2300ef7ba054f4493d0dff006be4 Mon Sep 17 00:00:00 2001 From: mdevolde Date: Fri, 3 Jul 2026 10:46:23 +0300 Subject: [PATCH] fix(server): properly detect remote_server URL scheme instead of substring match --- CHANGELOG.md | 1 + src/language_tool_python/_internals/utils.py | 64 ++++++++-- tests/unit/test_internals_utils.py | 118 ++++++++++++++++++- 3 files changed, 172 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8145c80..bc79086 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/2.0.0/), - Fixed a bug in `LanguageTool._query_server` where `RateLimitError` was only raised when the rate-limit response body was invalid JSON, a valid JSON body with status 426 was silently returned as data instead (for now, the body from LanguageTool for rate-limiting responses is "Upgrade Required", which is not valid JSON, but this may change in the future). - Fixed a bug in `LanguageTool._terminate_server` where `_RUNNING_SERVER_PROCESSES.remove()` could raise `ValueError` if the server process was not yet in the list or was no longer in it. - Fixed a missing warning when downloading a LanguageTool zip file (with `LanguageTool` or `LocalLanguageTool`) without an available SHA-256 checksum. Now a `RuntimeWarning` is emitted in this case. +- Fixed `remote_server` URL scheme detection in `language_tool_python.server.LanguageTool`, which used a fragile substring check (`"http" in remote_server`) and could misidentify a host containing "http" in its name (e.g. `myhttpserver.example.com`) as already having a scheme. `parse_url` now correctly recognises `scheme://` URLs, bare `scheme:` URLs (e.g. `http:example.com`, `ftp:example.com`), and protocol-relative URLs (`//host`). Only `http`/`https` are accepted, raising `ValueError` otherwise (unsupported scheme, empty URL, or missing host), and warning via `RuntimeWarning` when `http://` is added by default. ### Removed - **Breaking:** Removed all functions and classes previously deprecated in v3.3.0: diff --git a/src/language_tool_python/_internals/utils.py b/src/language_tool_python/_internals/utils.py index 0afa271..170ff1a 100644 --- a/src/language_tool_python/_internals/utils.py +++ b/src/language_tool_python/_internals/utils.py @@ -3,9 +3,11 @@ import logging import math import os +import re import urllib.parse from pathlib import Path -from typing import Protocol, runtime_checkable +from typing import Protocol, cast, runtime_checkable +from warnings import warn import psutil @@ -28,22 +30,70 @@ LTP_PATH_ENV_VAR = "LTP_PATH" # LanguageTool download path +_SUPPORTED_URL_SCHEMES = frozenset({"http", "https"}) + +# Matches an explicit "scheme://" prefix, capturing the scheme name. +_URL_SCHEME_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*)://") + +# Matches an explicit "scheme:" prefix that lacks the "//" authority marker +# (RFC 3986's scheme:opaque form, e.g. "ftp:example.com" or "mailto:a@b.com"). +_EXPLICIT_SCHEME_NO_SLASH_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*):(?!\d)") + + +def _check_supported_scheme(scheme: str) -> None: + """Raise ValueError if ``scheme`` is not one of the supported URL schemes.""" + if scheme not in _SUPPORTED_URL_SCHEMES: + err = ( + f"Unsupported URL scheme {scheme!r}. Only 'http' and 'https' are supported." + ) + raise ValueError(err) + def parse_url(url_str: str) -> str: """Parse the given URL string and ensure it has a scheme. - If the input URL string does not contain 'http', 'http://' is prepended to it. The - function then parses the URL and returns its canonical form. - :param url_str: The URL string to be parsed. :type url_str: str :return: The parsed URL in its canonical form. :rtype: str + :raises ValueError: If ``url_str`` is empty (or only whitespace), uses an + unsupported scheme, or has no host. """ - if "http" not in url_str: - url_str = "http://" + url_str + stripped = url_str.strip() + if not stripped: + err = "The URL must not be empty." + raise ValueError(err) + + scheme_match = _URL_SCHEME_RE.match(stripped) + if scheme_match is not None: + # scheme is present and followed by "//" + scheme = cast("str", scheme_match.group(1)).lower() + _check_supported_scheme(scheme) + else: + explicit_scheme_match = _EXPLICIT_SCHEME_NO_SLASH_RE.match(stripped) + if explicit_scheme_match is not None: + # scheme is present but not followed by "//" + scheme = cast("str", explicit_scheme_match.group(1)).lower() + _check_supported_scheme(scheme) + stripped = scheme + stripped[explicit_scheme_match.end(1) :] + else: + # scheme is missing, so we default to "http://" and issue a warning + prefix = "http:" if stripped.startswith("//") else "http://" + stripped = prefix + stripped + warn( + "No scheme was specified in the URL, so the unsecure 'http://' " + "scheme was used by default. It is recommended to explicitly set the " + "protocol ('http://' or 'https://') yourself in the URL.", + RuntimeWarning, + stacklevel=2, + ) + + parsed = urllib.parse.urlparse(stripped) + if not parsed.hostname: + err = f"The URL {stripped!r} must include a host." + raise ValueError(err) - return urllib.parse.urlparse(url_str).geturl() + return parsed.geturl() def get_env_int(env_var: str, default: int) -> int: diff --git a/tests/unit/test_internals_utils.py b/tests/unit/test_internals_utils.py index 28eae3d..fa3df61 100644 --- a/tests/unit/test_internals_utils.py +++ b/tests/unit/test_internals_utils.py @@ -40,15 +40,125 @@ def test_https_url_unchanged(self) -> None: assert parse_url("https://example.com") == "https://example.com" def test_adds_http_scheme(self) -> None: - """A host:port string without a scheme gets http:// prepended.""" - result = parse_url("localhost:8081") - assert result.startswith("http://") - assert "localhost" in result + """A host:port string without a scheme gets http:// prepended and warns.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("localhost:8081") + assert result == "http://localhost:8081" def test_canonical_form(self) -> None: """An already-complete URL with trailing slash is returned unchanged.""" assert parse_url("http://localhost:8081/") == "http://localhost:8081/" + def test_host_containing_http_substring_is_prefixed(self) -> None: + """A schemeless host containing 'http' in its name still gets prefixed.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("myhttpserver.example.com") + assert result == "http://myhttpserver.example.com" + + def test_host_with_port_and_path(self) -> None: + """A schemeless host:port/path string is prefixed and preserved.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("example.com:8081/api") + assert result == "http://example.com:8081/api" + + def test_uppercase_scheme_is_normalised(self) -> None: + """An uppercase scheme is recognised and normalised to lowercase.""" + assert parse_url("HTTP://example.com") == "http://example.com" + + def test_mixed_case_https_scheme_is_normalised(self) -> None: + """A mixed-case https scheme is recognised and normalised to lowercase.""" + assert parse_url("HTTPS://example.com") == "https://example.com" + + def test_strips_surrounding_whitespace(self) -> None: + """Leading and trailing whitespace around the URL is ignored.""" + assert parse_url(" http://example.com ") == "http://example.com" + + def test_empty_string_raises(self) -> None: + """An empty URL string raises ValueError.""" + with pytest.raises(ValueError, match="must not be empty"): + parse_url("") + + def test_whitespace_only_raises(self) -> None: + """A whitespace-only URL string raises ValueError.""" + with pytest.raises(ValueError, match="must not be empty"): + parse_url(" ") + + def test_unsupported_scheme_raises(self) -> None: + """An explicit, unsupported scheme (e.g. ftp) raises ValueError.""" + with pytest.raises(ValueError, match="Unsupported URL scheme 'ftp'"): + parse_url("ftp://example.com") + + def test_file_scheme_raises(self) -> None: + """An explicit file:// scheme raises ValueError instead of passing through.""" + with pytest.raises(ValueError, match="Unsupported URL scheme 'file'"): + parse_url("file:///etc/passwd") + + def test_scheme_without_host_raises(self) -> None: + """A scheme with no host raises ValueError.""" + with pytest.raises(ValueError, match="must include a host"): + parse_url("http://") + + def test_scheme_with_empty_host_and_port_raises(self) -> None: + """A scheme with a port but no host raises ValueError.""" + with pytest.raises(ValueError, match="must include a host"): + parse_url("http://:8081") + + def test_error_message_strips_whitespace(self) -> None: + """The host-missing error message does not leak surrounding whitespace.""" + with pytest.raises( + ValueError, match=r"^The URL 'http://:8081' must include a host\.$" + ): + parse_url(" http://:8081 ") + + def test_protocol_relative_url_gets_http_scheme(self) -> None: + """A protocol-relative URL (//host) is given an http:// scheme, not mangled.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("//example.com:8081") + assert result == "http://example.com:8081" + + def test_explicit_scheme_without_slashes_raises_clean_message(self) -> None: + """An RFC 3986 scheme:opaque form (no host) raises with a clean message.""" + with pytest.raises( + ValueError, match=r"^The URL 'http:example\.com' must include a host\.$" + ): + parse_url("http:example.com") + + def test_uppercase_explicit_scheme_without_slashes_raises(self) -> None: + """An uppercase scheme:opaque form is treated the same as lowercase.""" + with pytest.raises(ValueError, match="must include a host"): + parse_url("HTTPS:example.com") + + def test_schemeless_host_port_unaffected_by_scheme_without_slash_detection( + self, + ) -> None: + """A schemeless host:port input is still treated as schemeless.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("localhost:8081") + assert result == "http://localhost:8081" + + def test_schemeless_host_port_path_unaffected_by_scheme_without_slash_detection( + self, + ) -> None: + """A schemeless host:port/path input is still treated as schemeless.""" + with pytest.warns(RuntimeWarning, match="No scheme was specified"): + result = parse_url("example.com:8081/api") + assert result == "http://example.com:8081/api" + + def test_ftp_scheme_without_slashes_raises_unsupported(self) -> None: + """An unsupported scheme:opaque form (ftp:) raises Unsupported URL scheme.""" + with pytest.raises(ValueError, match="Unsupported URL scheme 'ftp'"): + parse_url("ftp:example.com") + + def test_mailto_scheme_without_slashes_raises_unsupported(self) -> None: + """An unsupported scheme:opaque form (mailto:) raises Unsupported URL scheme.""" + with pytest.raises(ValueError, match="Unsupported URL scheme 'mailto'"): + parse_url("mailto:a@b.com") + + def test_ws_scheme_without_slashes_raises_unsupported(self) -> None: + """An unsupported scheme:opaque form (ws:) raises Unsupported URL scheme.""" + with pytest.raises(ValueError, match="Unsupported URL scheme 'ws'"): + parse_url("ws:example.com") + class TestGetEnvInt: """Tests for get_env_int() environment variable reader."""