Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/2.0.0/),
- Fixed a bug in `LanguageTool._query_server` where `RateLimitError` was only raised when the rate-limit response body was invalid JSON, a valid JSON body with status 426 was silently returned as data instead (for now, the body from LanguageTool for rate-limiting responses is "Upgrade Required", which is not valid JSON, but this may change in the future).
- Fixed a bug in `LanguageTool._terminate_server` where `_RUNNING_SERVER_PROCESSES.remove()` could raise `ValueError` if the server process was not yet in the list or was no longer in it.
- Fixed a missing warning when downloading a LanguageTool zip file (with `LanguageTool` or `LocalLanguageTool`) without an available SHA-256 checksum. Now a `RuntimeWarning` is emitted in this case.
- Fixed `remote_server` URL scheme detection in `language_tool_python.server.LanguageTool`, which used a fragile substring check (`"http" in remote_server`) and could misidentify a host containing "http" in its name (e.g. `myhttpserver.example.com`) as already having a scheme. `parse_url` now correctly recognises `scheme://` URLs, bare `scheme:` URLs (e.g. `http:example.com`, `ftp:example.com`), and protocol-relative URLs (`//host`). Only `http`/`https` are accepted, raising `ValueError` otherwise (unsupported scheme, empty URL, or missing host), and warning via `RuntimeWarning` when `http://` is added by default.

### Removed
- **Breaking:** Removed all functions and classes previously deprecated in v3.3.0:
Expand Down
64 changes: 57 additions & 7 deletions src/language_tool_python/_internals/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import logging
import math
import os
import re
import urllib.parse
from pathlib import Path
from typing import Protocol, runtime_checkable
from typing import Protocol, cast, runtime_checkable
from warnings import warn

import psutil

Expand All @@ -28,22 +30,70 @@

LTP_PATH_ENV_VAR = "LTP_PATH" # LanguageTool download path

_SUPPORTED_URL_SCHEMES = frozenset({"http", "https"})

# Matches an explicit "scheme://" prefix, capturing the scheme name.
_URL_SCHEME_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*)://")

# Matches an explicit "scheme:" prefix that lacks the "//" authority marker
# (RFC 3986's scheme:opaque form, e.g. "ftp:example.com" or "mailto:a@b.com").
_EXPLICIT_SCHEME_NO_SLASH_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*):(?!\d)")


def _check_supported_scheme(scheme: str) -> None:
"""Raise ValueError if ``scheme`` is not one of the supported URL schemes."""
if scheme not in _SUPPORTED_URL_SCHEMES:
err = (
f"Unsupported URL scheme {scheme!r}. Only 'http' and 'https' are supported."
)
raise ValueError(err)


def parse_url(url_str: str) -> str:
"""Parse the given URL string and ensure it has a scheme.

If the input URL string does not contain 'http', 'http://' is prepended to it. The
function then parses the URL and returns its canonical form.

:param url_str: The URL string to be parsed.
:type url_str: str
:return: The parsed URL in its canonical form.
:rtype: str
:raises ValueError: If ``url_str`` is empty (or only whitespace), uses an
unsupported scheme, or has no host.
"""
if "http" not in url_str:
url_str = "http://" + url_str
stripped = url_str.strip()
if not stripped:
err = "The URL must not be empty."
raise ValueError(err)

scheme_match = _URL_SCHEME_RE.match(stripped)
if scheme_match is not None:
# scheme is present and followed by "//"
scheme = cast("str", scheme_match.group(1)).lower()
_check_supported_scheme(scheme)
else:
explicit_scheme_match = _EXPLICIT_SCHEME_NO_SLASH_RE.match(stripped)
if explicit_scheme_match is not None:
# scheme is present but not followed by "//"
scheme = cast("str", explicit_scheme_match.group(1)).lower()
_check_supported_scheme(scheme)
stripped = scheme + stripped[explicit_scheme_match.end(1) :]
else:
# scheme is missing, so we default to "http://" and issue a warning
prefix = "http:" if stripped.startswith("//") else "http://"
stripped = prefix + stripped
warn(
"No scheme was specified in the URL, so the unsecure 'http://' "
"scheme was used by default. It is recommended to explicitly set the "
"protocol ('http://' or 'https://') yourself in the URL.",
RuntimeWarning,
stacklevel=2,
)

parsed = urllib.parse.urlparse(stripped)
if not parsed.hostname:
err = f"The URL {stripped!r} must include a host."
raise ValueError(err)

return urllib.parse.urlparse(url_str).geturl()
return parsed.geturl()


def get_env_int(env_var: str, default: int) -> int:
Expand Down
118 changes: 114 additions & 4 deletions tests/unit/test_internals_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,125 @@ def test_https_url_unchanged(self) -> None:
assert parse_url("https://example.com") == "https://example.com"

def test_adds_http_scheme(self) -> None:
"""A host:port string without a scheme gets http:// prepended."""
result = parse_url("localhost:8081")
assert result.startswith("http://")
assert "localhost" in result
"""A host:port string without a scheme gets http:// prepended and warns."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("localhost:8081")
assert result == "http://localhost:8081"

def test_canonical_form(self) -> None:
"""An already-complete URL with trailing slash is returned unchanged."""
assert parse_url("http://localhost:8081/") == "http://localhost:8081/"

def test_host_containing_http_substring_is_prefixed(self) -> None:
"""A schemeless host containing 'http' in its name still gets prefixed."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("myhttpserver.example.com")
assert result == "http://myhttpserver.example.com"

def test_host_with_port_and_path(self) -> None:
"""A schemeless host:port/path string is prefixed and preserved."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("example.com:8081/api")
assert result == "http://example.com:8081/api"

def test_uppercase_scheme_is_normalised(self) -> None:
"""An uppercase scheme is recognised and normalised to lowercase."""
assert parse_url("HTTP://example.com") == "http://example.com"

def test_mixed_case_https_scheme_is_normalised(self) -> None:
"""A mixed-case https scheme is recognised and normalised to lowercase."""
assert parse_url("HTTPS://example.com") == "https://example.com"

def test_strips_surrounding_whitespace(self) -> None:
"""Leading and trailing whitespace around the URL is ignored."""
assert parse_url(" http://example.com ") == "http://example.com"

def test_empty_string_raises(self) -> None:
"""An empty URL string raises ValueError."""
with pytest.raises(ValueError, match="must not be empty"):
parse_url("")

def test_whitespace_only_raises(self) -> None:
"""A whitespace-only URL string raises ValueError."""
with pytest.raises(ValueError, match="must not be empty"):
parse_url(" ")

def test_unsupported_scheme_raises(self) -> None:
"""An explicit, unsupported scheme (e.g. ftp) raises ValueError."""
with pytest.raises(ValueError, match="Unsupported URL scheme 'ftp'"):
parse_url("ftp://example.com")

def test_file_scheme_raises(self) -> None:
"""An explicit file:// scheme raises ValueError instead of passing through."""
with pytest.raises(ValueError, match="Unsupported URL scheme 'file'"):
parse_url("file:///etc/passwd")

def test_scheme_without_host_raises(self) -> None:
"""A scheme with no host raises ValueError."""
with pytest.raises(ValueError, match="must include a host"):
parse_url("http://")

def test_scheme_with_empty_host_and_port_raises(self) -> None:
"""A scheme with a port but no host raises ValueError."""
with pytest.raises(ValueError, match="must include a host"):
parse_url("http://:8081")

def test_error_message_strips_whitespace(self) -> None:
"""The host-missing error message does not leak surrounding whitespace."""
with pytest.raises(
ValueError, match=r"^The URL 'http://:8081' must include a host\.$"
):
parse_url(" http://:8081 ")

def test_protocol_relative_url_gets_http_scheme(self) -> None:
"""A protocol-relative URL (//host) is given an http:// scheme, not mangled."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("//example.com:8081")
assert result == "http://example.com:8081"

def test_explicit_scheme_without_slashes_raises_clean_message(self) -> None:
"""An RFC 3986 scheme:opaque form (no host) raises with a clean message."""
with pytest.raises(
ValueError, match=r"^The URL 'http:example\.com' must include a host\.$"
):
parse_url("http:example.com")

def test_uppercase_explicit_scheme_without_slashes_raises(self) -> None:
"""An uppercase scheme:opaque form is treated the same as lowercase."""
with pytest.raises(ValueError, match="must include a host"):
parse_url("HTTPS:example.com")

def test_schemeless_host_port_unaffected_by_scheme_without_slash_detection(
self,
) -> None:
"""A schemeless host:port input is still treated as schemeless."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("localhost:8081")
assert result == "http://localhost:8081"

def test_schemeless_host_port_path_unaffected_by_scheme_without_slash_detection(
self,
) -> None:
"""A schemeless host:port/path input is still treated as schemeless."""
with pytest.warns(RuntimeWarning, match="No scheme was specified"):
result = parse_url("example.com:8081/api")
assert result == "http://example.com:8081/api"

def test_ftp_scheme_without_slashes_raises_unsupported(self) -> None:
"""An unsupported scheme:opaque form (ftp:) raises Unsupported URL scheme."""
with pytest.raises(ValueError, match="Unsupported URL scheme 'ftp'"):
parse_url("ftp:example.com")

def test_mailto_scheme_without_slashes_raises_unsupported(self) -> None:
"""An unsupported scheme:opaque form (mailto:) raises Unsupported URL scheme."""
with pytest.raises(ValueError, match="Unsupported URL scheme 'mailto'"):
parse_url("mailto:a@b.com")

def test_ws_scheme_without_slashes_raises_unsupported(self) -> None:
"""An unsupported scheme:opaque form (ws:) raises Unsupported URL scheme."""
with pytest.raises(ValueError, match="Unsupported URL scheme 'ws'"):
parse_url("ws:example.com")


class TestGetEnvInt:
"""Tests for get_env_int() environment variable reader."""
Expand Down
Loading