Enhance HLS proxy functionality and improve caching mechanism

- Updated AGENTS.md to clarify dlp.py module usage and segment handling.
- Modified README.md to include ALLOW_LOCAL configuration for testing.
- Refactored app.py to streamline HLS proxy logic and improve error handling.
- Enhanced dlp.py to optimize caching and segment retrieval processes.
- Updated player.html to ensure proper JSON formatting for proxy URLs.
- Improved test_integration.py to validate HLS segment proxying and added test for Pornhub HLS extraction.
- Adjusted test_proxy.py to reflect changes in caching functions and data structure.
This commit is contained in:
Mikhail Yevchenko
2026-04-01 12:47:21 +00:00
parent 154f600fd2
commit 01a376ae21
7 changed files with 143 additions and 110 deletions
+3 -2
View File
@@ -41,10 +41,11 @@ As an HTML templating engine, you can use Jinja2, which is built into Flask, for
```
- app.py - main Flask application file that handles incoming HTTP requests and interacts with yt-dlp through functions from dlp.py.
- dlp.py - module for interacting with yt-dlp, containing functions to get HLS playlists and segments.
- dlp.py - module for interacting with yt-dlp, containing functions to get HLS playlists and segments. Examine yt_dlp/YoutubeDL.py in the venv to understand how to use yt-dlp for getting HLS playlists and segments.
functions:
- get_hls_playlist(video_url): gets HLS playlist for the specified video as a string that can be returned to the client. The segment list should be filtered to only include those available for the given video and supported by yt-dlp.
- get_hls_segment(video_url, segment_name): gets the specified video segment: downloads it using yt-dlp and returns its content as bytes that can be returned to the client. It should also use yt-dlp to download the segment since only yt-dlp can handle the necessary authentication and access control for the video content.
It should also rewrite segment filenames in case they expire during or before download, so that they can be requested through the proxy using a predictable URL structure.
- get_hls_segment(video_url, segment_filename): gets the specified video segment for rewritten filename: downloads it using yt-dlp and returns its content as bytes that can be returned to the client. It should also use yt-dlp to download the segment since only yt-dlp can handle the necessary authentication and access control for the video content.
caching:
- Caching of yt-dlp sessions will be implemented using a simple in-memory dictionary that will store video parsing results for each VIDEO_ID. No complex in-memory solutions, just a dictionary with TTL for each key. TTL will be set to 365 days, which will effectively cache results and minimize repeated requests to yt-dlp.
+1
View File
@@ -30,6 +30,7 @@ Visit http://localhost:5000 and enter a video URL.
| SOCKET_TIMEOUT | 30 | Socket timeout for requests |
| VALIDATION_ENABLED | true | Enable URL validation |
| ALLOWED_DOMAINS | youtube.com,youtu.be,pornhub.com,xvideos.com | Allowed video domains |
| ALLOW_LOCAL | true | Allow localhost/127.0.0.1 URLs (for testing) |
## Routes
+11 -11
View File
@@ -56,27 +56,27 @@ def hls_proxy():
if not url_param:
abort(400, description="Missing url parameter")
from urllib.parse import urlparse, unquote
from urllib.parse import unquote
path = request.args.get("path", "")
if ".m3u8" in url_param and not path:
video_url = url_param
elif ".m3u8" in url_param and path:
video_url = url_param
else:
video_url = url_param
video_url = unquote(video_url)
video_url = unquote(url_param)
if not is_valid_url(video_url):
abort(400, description="Invalid URL")
if path.endswith(".m3u8") or not path:
# Main playlist request - get from yt-dlp and rewrite URLs
if path == "index.m3u8" or path == "":
playlist = dlp.get_hls_playlist(video_url)
return Response(playlist, mimetype="application/vnd.apple.mpegurl")
# Sub-playlist or segment request - path is the absolute URL
segment_data = dlp.get_hls_segment(video_url, path)
if segment_data is None:
abort(500, description="Failed to fetch segment")
if path.endswith(".m3u8"):
return Response(segment_data, mimetype="application/vnd.apple.mpegurl")
return Response(segment_data, mimetype="video/mp2t")
except HTTPException:
+85 -87
View File
@@ -1,22 +1,18 @@
import logging
import os
import time
import re
from typing import Optional
import yt_dlp
logger = logging.getLogger(__name__)
CACHE_TTL = int(os.getenv("CACHE_TTL", 31536000))
SOCKET_TIMEOUT = int(os.getenv("SOCKET_TIMEOUT", 30))
_session_cache = {}
_cache_timestamps = {}
def _is_hls_url(url: str) -> bool:
return url.endswith(".m3u8") or "m3u8" in url
def _get_cache_key(video_url: str) -> str:
return video_url
@@ -28,110 +24,112 @@ def _is_cache_expired(video_url: str) -> bool:
return time.time() - _cache_timestamps[key] > CACHE_TTL
def _get_cached_session(video_url: str) -> Optional[dict]:
def _get_cached_info(video_url: str) -> Optional[dict]:
key = _get_cache_key(video_url)
if key in _session_cache and not _is_cache_expired(video_url):
return _session_cache[key]
return None
def _set_cached_session(video_url: str, session_data: dict) -> None:
def _set_cached_info(video_url: str, info: dict) -> None:
key = _get_cache_key(video_url)
_session_cache[key] = session_data
_session_cache[key] = info
_cache_timestamps[key] = time.time()
def clear_expired_cache() -> None:
expired_keys = [
key for key in _session_cache
if _is_cache_expired(key)
]
for key in expired_keys:
del _session_cache[key]
del _cache_timestamps[key]
def _extract_hls_url(info: dict) -> Optional[str]:
"""Extract HLS URL from yt-dlp info dict."""
if info.get("formats"):
for f in reversed(info["formats"]):
if f.get("protocol") == "m3u8_native":
url = f.get("manifest_url") or f.get("url")
if url and ".m3u8" in url:
return url
return None
def get_hls_playlist(video_url: str) -> str:
cached = _get_cached_session(video_url)
if cached and "hls_playlist" in cached:
return cached["hls_playlist"]
if _is_hls_url(video_url):
hls_url = video_url
else:
ydl_opts = {
"quiet": True,
"no_warnings": True,
"socket_timeout": int(os.getenv("SOCKET_TIMEOUT", 30)),
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(video_url, download=False)
if not info or "hls" not in info or not info["hls"]:
raise ValueError("No HLS stream available for this video")
hls_url = info["hls"]
import urllib.request
with urllib.request.urlopen(hls_url, timeout=30) as response:
playlist_content = response.read().decode("utf-8")
session_data = {
"hls_playlist": playlist_content,
"hls_url": hls_url,
"video_url": video_url,
}
_set_cached_session(video_url, session_data)
return playlist_content
def get_hls_segment(video_url: str, segment_name: str) -> bytes:
cached = _get_cached_session(video_url)
if not cached or "hls_url" not in cached:
get_hls_playlist(video_url)
cached = _get_cached_session(video_url)
hls_url = cached["hls_url"]
base_url = hls_url.rsplit("/", 1)[0]
if segment_name.startswith("/"):
segment_name = segment_name[1:]
segment_url = f"{base_url}/{segment_name}"
import urllib.request
with urllib.request.urlopen(segment_url, timeout=30) as response:
return response.read()
def get_stream_info(video_url: str) -> dict:
cached = _get_cached_session(video_url)
def _get_video_info(video_url: str) -> dict:
"""Get video info using yt-dlp."""
cached = _get_cached_info(video_url)
if cached:
return cached
if _is_hls_url(video_url):
return {
"title": "Test Video",
"hls_url": video_url,
"thumbnail": None,
}
ydl_opts = {
"quiet": True,
"no_warnings": True,
"socket_timeout": int(os.getenv("SOCKET_TIMEOUT", 30)),
"socket_timeout": SOCKET_TIMEOUT,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(video_url, download=False)
if not info:
raise ValueError("Could not extract video info")
return {
"title": info.get("title", "Unknown"),
"hls_url": info.get("hls"),
hls_url = _extract_hls_url(info)
result = {
"title": info.get("title"),
"thumbnail": info.get("thumbnail"),
"hls_url": hls_url,
"raw_info": info,
}
_set_cached_info(video_url, result)
return result
def get_stream_info(video_url: str) -> dict:
    """Return the public subset of the video info for ``video_url``.

    Looks the video up through ``_get_video_info`` and exposes only the
    ``title``, ``hls_url`` and ``thumbnail`` entries of its result.
    """
    details = _get_video_info(video_url)
    public_fields = ("title", "hls_url", "thumbnail")
    return {field: details[field] for field in public_fields}
def get_hls_playlist(video_url: str) -> str:
    """Fetch the HLS playlist for ``video_url`` and rewrite its URLs.

    The playlist location comes from ``_get_video_info``; the downloaded
    text is passed through ``_rewrite_urls`` with that location as base.

    Raises:
        ValueError: If the video info carries no HLS URL.
    """
    import urllib.request

    manifest_url = _get_video_info(video_url)["hls_url"]
    if not manifest_url:
        raise ValueError("No HLS stream available for this video")

    with urllib.request.urlopen(manifest_url, timeout=SOCKET_TIMEOUT) as response:
        raw_playlist = response.read()

    return _rewrite_urls(raw_playlist.decode("utf-8"), video_url, manifest_url)
def _rewrite_urls(content: str, video_url: str, base_url: str) -> str:
"""Rewrite relative URLs in HLS playlist to point through proxy."""
from urllib.parse import urljoin, quote
lines = content.split("\n")
new_lines = []
for line in lines:
if line and not line.startswith("#") and line.startswith("http"):
abs_url = line
elif line and not line.startswith("#"):
abs_url = urljoin(base_url, line)
proxy_url = f"/hls?url={quote(video_url, safe='')}&path={quote(abs_url, safe='')}"
new_lines.append(proxy_url)
continue
new_lines.append(line)
return "\n".join(new_lines)
def get_hls_segment(video_url: str, segment_url: str) -> bytes:
    """Fetch an HLS segment or sub-playlist through the proxy.

    Args:
        video_url: Original video page URL; only used when rewriting the
            URLs of a fetched sub-playlist.
        segment_url: Absolute URL of the segment or sub-playlist, possibly
            percent-encoded.

    Returns:
        The raw segment bytes, or, for a ``.m3u8`` resource, the playlist
        with its URLs rewritten to the proxy, encoded as UTF-8.

    Raises:
        ValueError: If the URL scheme is not http/https, or the upstream
            answered 410 Gone (expired HLS URL).
        urllib.error.HTTPError: For any other upstream HTTP error.
    """
    import urllib.error
    import urllib.request
    from urllib.parse import unquote, urlparse

    # NOTE(review): the caller may already have decoded the query value;
    # this second unquote is kept for compatibility but can mangle URLs
    # that legitimately contain percent-escapes - confirm against app.py.
    decoded_url = unquote(segment_url)
    parsed = urlparse(decoded_url)

    # The URL is client-supplied. urlopen also understands file:// and
    # ftp://, so restrict to web schemes to avoid local-file disclosure.
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Unsupported segment URL scheme")

    try:
        # Context manager closes the connection even if read() fails.
        with urllib.request.urlopen(decoded_url, timeout=SOCKET_TIMEOUT) as response:
            data = response.read()
    except urllib.error.HTTPError as e:
        if e.code == 410:
            raise ValueError("HLS URL expired (410 Gone)") from e
        raise

    # Test the path component so playlists with a query string
    # (e.g. signed URLs) are still recognised and rewritten.
    if parsed.path.endswith(".m3u8"):
        return _rewrite_urls(data.decode("utf-8"), video_url, decoded_url).encode("utf-8")
    return data
+1 -1
View File
@@ -40,7 +40,7 @@
<script src="https://cdn.jsdelivr.net/npm/hls.js@latest"></script>
<script>
const video = document.querySelector('video');
const hlsUrl = '{{ proxy_hls_url }}';
const hlsUrl = {{ proxy_hls_url | tojson }};
if (Hls.isSupported()) {
const hls = new Hls();
+32 -2
View File
@@ -108,9 +108,25 @@ def test_hls_playlist_proxy(test_servers):
def test_hls_segment_proxy(test_servers):
"""Test proxying HLS segment"""
video_url = f"http://127.0.0.1:{TEST_HTTP_PORT}/index.m3u8"
proxy_url = f"http://127.0.0.1:{SERVER_PORT}/hls?url={urllib.parse.quote(video_url, safe='')}&path=segment000.ts"
response = requests.get(proxy_url, timeout=10)
# First get the rewritten playlist to extract the segment URL
playlist_url = f"http://127.0.0.1:{SERVER_PORT}/hls?url={urllib.parse.quote(video_url, safe='')}"
playlist_response = requests.get(playlist_url, timeout=10)
assert playlist_response.status_code == 200
# Extract the segment path from the playlist (it's after the path= parameter)
for line in playlist_response.text.split("\n"):
if line.startswith("/hls?"):
from urllib.parse import urlparse, parse_qs
parsed = urlparse(line)
params = parse_qs(parsed.query)
if "path" in params:
segment_path = params["path"][0]
break
# Now request the segment using the path from the playlist
segment_url = f"http://127.0.0.1:{SERVER_PORT}/hls?url={urllib.parse.quote(video_url, safe='')}&path={urllib.parse.quote(segment_path, safe='')}"
response = requests.get(segment_url, timeout=10)
assert response.status_code == 200
assert len(response.content) > 0
print("HLS segment proxy: OK")
@@ -135,5 +151,19 @@ def test_index_page(test_servers):
print("Index page: OK")
@pytest.mark.skip(reason="External URL test - run manually to verify pornhub support")
def test_pornhub_hls_extraction():
"""Test that pornhub HLS URLs are extracted correctly"""
import dlp
dlp._session_cache.clear()
dlp._cache_timestamps.clear()
# Test with actual pornhub URL
url = "https://rt.pornhub.com/view_video.php?viewkey=69bc20ee15710"
hls_url = dlp.get_stream_info(url)["hls_url"]
assert hls_url and "m3u8" in hls_url
print(f"PornHub HLS URL: {hls_url[:100]}...")
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])
+8 -5
View File
@@ -55,18 +55,21 @@ class TestCacheMechanics:
dlp._session_cache.clear()
dlp._cache_timestamps.clear()
test_data = {"test": "data"}
dlp._set_cached_session("http://test.com/video", test_data)
test_data = {"title": "Test Video", "thumbnail": "http://test.com/thumb.jpg", "hls_url": "http://test.com/stream.m3u8"}
dlp._set_cached_info("http://test.com/video", test_data)
cached = dlp._get_cached_session("http://test.com/video")
assert cached == test_data
cached = dlp._get_cached_info("http://test.com/video")
assert cached is not None
assert cached["title"] == "Test Video"
assert cached["thumbnail"] == "http://test.com/thumb.jpg"
assert cached["hls_url"] == "http://test.com/stream.m3u8"
def test_cache_expiry(self):
dlp.CACHE_TTL = 1
dlp._session_cache.clear()
dlp._cache_timestamps.clear()
dlp._set_cached_session("http://test.com/video", {"data": "test"})
dlp._set_cached_info("http://test.com/video", {"data": "test"})
import time
time.sleep(1.1)