/
/
/
1"""Internet Archive music provider implementation."""
2
3from __future__ import annotations
4
5import contextlib
6import re
7from collections.abc import AsyncGenerator
8from typing import TYPE_CHECKING, Any
9
10import aiohttp
11from music_assistant_models.enums import MediaType, ProviderFeature
12from music_assistant_models.errors import InvalidDataError, MediaNotFoundError
13from music_assistant_models.media_items import (
14 Album,
15 Artist,
16 Audiobook,
17 MediaItemChapter,
18 Podcast,
19 PodcastEpisode,
20 ProviderMapping,
21 SearchResults,
22 Track,
23)
24from music_assistant_models.unique_list import UniqueList
25
26from music_assistant.constants import UNKNOWN_ARTIST
27from music_assistant.controllers.cache import use_cache
28from music_assistant.helpers.throttle_retry import ThrottlerManager, throttle_with_retries
29from music_assistant.models.music_provider import MusicProvider
30
31from .helpers import InternetArchiveClient, clean_text, extract_year, parse_duration
32from .parsers import (
33 add_item_image,
34 artist_exists,
35 create_artist,
36 create_provider_mapping,
37 create_title_from_identifier,
38 doc_to_album,
39 doc_to_audiobook,
40 doc_to_podcast,
41 doc_to_track,
42 is_audiobook_content,
43 is_likely_album,
44 is_podcast_content,
45)
46from .streaming import InternetArchiveStreaming
47
48if TYPE_CHECKING:
49 from music_assistant_models.config_entries import ProviderConfig
50 from music_assistant_models.provider import ProviderManifest
51 from music_assistant_models.streamdetails import StreamDetails
52
53 from music_assistant import MusicAssistant
54
55
class InternetArchiveProvider(MusicProvider):
    """
    Implementation of Internet Archive music provider.

    Offers search plus track, album, artist, audiobook and podcast lookups. Remote calls
    go through ``InternetArchiveClient`` via the throttled wrapper methods below; stream
    detail resolution is delegated to ``InternetArchiveStreaming``.

    Items that live inside a multi-file archive item (album tracks, podcast episodes)
    use ids of the form ``<identifier>#<filename>``.
    """

    # Leading digits of a name, optional separators, then at least one more character.
    _TRACK_NUMBER_RE = re.compile(r"^(\d+)[\s\-_.]*(.+)")
    # Maximum number of items returned by get_artist_toptracks.
    _TOP_TRACKS_LIMIT = 25

    def __init__(
        self,
        mass: MusicAssistant,
        manifest: ProviderManifest,
        config: ProviderConfig,
        supported_features: set[ProviderFeature],
    ) -> None:
        """Initialize the provider, its throttler, API client and streaming helper."""
        super().__init__(mass, manifest, config, supported_features)
        # Rate limiting settings; presumably consumed by @throttle_with_retries
        # through ``self.throttler`` - TODO confirm against the helper.
        self.throttler = ThrottlerManager(
            rate_limit=10, period=60, retry_attempts=5, initial_backoff=5
        )
        self.client = InternetArchiveClient(mass)
        self.streaming = InternetArchiveStreaming(self)

    @property
    def is_streaming_provider(self) -> bool:
        """Return True if provider is a streaming provider."""
        return True

    # ------------------------------------------------------------------
    # Throttled client wrappers
    # ------------------------------------------------------------------

    @throttle_with_retries
    async def _get_json(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
        """Make a GET request and return JSON response with throttling."""
        return await self.client._get_json(url, params)

    @throttle_with_retries
    async def _search(self, **kwargs: Any) -> dict[str, Any]:
        """Throttled search wrapper; keyword arguments are passed to the client as-is."""
        return await self.client.search(**kwargs)

    @throttle_with_retries
    async def _get_metadata(self, identifier: str) -> dict[str, Any]:
        """Throttled metadata wrapper."""
        return await self.client.get_metadata(identifier)

    # NOTE(review): the throttle decorator wraps the cache decorator here, so cached
    # hits may still pass through the throttler - confirm whether the order is intended.
    @throttle_with_retries
    @use_cache(expiration=86400 * 30)  # 30 days - file listings are static
    async def _get_audio_files(self, identifier: str) -> list[dict[str, Any]]:
        """Throttled audio files wrapper."""
        return await self.client.get_audio_files(identifier)

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _artist_query(artist_name: str) -> str:
        """
        Build the search query used to list audio items of one creator.

        The name is placed inside a quoted phrase, so backslashes and double quotes
        are escaped to keep the query well-formed for names containing them.
        """
        escaped = artist_name.replace("\\", "\\\\").replace('"', '\\"')
        return (
            f'creator:"{escaped}" AND '
            '(format:"VBR MP3" OR format:"FLAC" OR format:"Ogg Vorbis")'
        )

    @staticmethod
    def _file_display_name(file_info: dict[str, Any]) -> tuple[str, str]:
        """
        Return ``(filename, display_name)`` for a file entry.

        The display name is the file's ``title`` when present and different from the
        filename; otherwise the filename with its last extension removed.
        """
        filename = file_info.get("name", "")
        display_name = file_info.get("title", filename)
        if not display_name or display_name == filename:
            display_name = filename.rsplit(".", 1)[0] if "." in filename else filename
        return filename, display_name

    @staticmethod
    def _chapter_index_for_position(
        chapters_data: list[dict[str, Any]], seek_position: int
    ) -> int:
        """
        Return the index of the chapter that contains ``seek_position`` (seconds).

        Returns 0 when no seek is requested, when there is no chapter data, or when
        the position lies beyond the summed chapter lengths.
        """
        if seek_position <= 0 or not chapters_data:
            return 0
        seek_position_ms = seek_position * 1000
        elapsed_ms = 0
        for index, chapter_data in enumerate(chapters_data):
            length_ms = (parse_duration(chapter_data.get("length", "0")) or 0) * 1000
            if elapsed_ms + length_ms > seek_position_ms:
                return index
            elapsed_ms += length_ms
        return 0

    def _item_mapping(self, item_id: str) -> ProviderMapping:
        """Create the provider mapping for a top-level archive item."""
        return create_provider_mapping(
            item_id, self.domain, self.instance_id, self.client.get_item_url
        )

    def _file_mapping(self, identifier: str, filename: str) -> ProviderMapping:
        """Create the provider mapping for one file inside an archive item."""
        return ProviderMapping(
            item_id=f"{identifier}#{filename}",
            provider_domain=self.domain,
            provider_instance=self.instance_id,
            url=self.client.get_download_url(identifier, filename),
            available=True,
        )

    def _artist_or_unknown(self, creator: str | None) -> Artist:
        """Create an artist for ``creator``, using UNKNOWN_ARTIST when it is empty."""
        return create_artist(creator or UNKNOWN_ARTIST, self.domain, self.instance_id)

    def _collect_artist(self, creator: str | None, artists: list[Artist]) -> None:
        """Append the artist for ``creator`` to ``artists`` unless empty or already listed."""
        if not creator:
            return
        artist = create_artist(creator, self.domain, self.instance_id)
        if artist and not artist_exists(artist, artists):
            artists.append(artist)

    @staticmethod
    def _build_search_strategies(
        search_query: str, media_types: list[MediaType]
    ) -> list[tuple[str, str]]:
        """Return ``(query, sort order)`` pairs to try for the requested media types."""
        strategies: list[tuple[str, str]] = []

        # For music searches: focus on creator, title and subject
        if any(mt in media_types for mt in (MediaType.TRACK, MediaType.ALBUM, MediaType.ARTIST)):
            strategies.extend(
                (f"{field}:({search_query}) AND mediatype:audio", "downloads desc")
                for field in ("creator", "title", "subject")
            )

        # For audiobooks: search within audiobook collections, still limit to audio
        if MediaType.AUDIOBOOK in media_types:
            audiobook_query = (
                f"{search_query} AND collection:(librivoxaudio OR audio_bookspoetry) "
                "AND mediatype:audio"
            )
            strategies.append((audiobook_query, "downloads desc"))

        # For podcasts: search within podcast collections
        if MediaType.PODCAST in media_types:
            podcast_query = f"{search_query} AND collection:podcasts AND mediatype:audio"
            strategies.append((podcast_query, "downloads desc"))

        return strategies

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    @use_cache(86400 * 7)  # 7 days
    async def search(
        self,
        search_query: str,
        media_types: list[MediaType],
        limit: int = 5,
    ) -> SearchResults:
        """
        Perform search on Internet Archive.

        Runs several query strategies in turn, accumulating results across them and
        skipping identifiers that were already processed. Stops early once every
        requested media type has at least ``limit`` results. A strategy that fails
        is logged and skipped.

        Args:
            search_query: The search term to look for
            media_types: List of media types to search for
            limit: Maximum number of results to return per media type

        Returns:
            SearchResults object containing found items
        """
        if not search_query.strip():
            return SearchResults()

        # Adjust search intensity based on what's being requested
        rows_per_strategy = min(limit * 2, 16) if len(media_types) > 1 else min(limit * 2, 100)

        tracks: list[Track] = []
        albums: list[Album] = []
        artists: list[Artist] = []
        audiobooks: list[Audiobook] = []
        podcasts: list[Podcast] = []

        # Track processed identifiers to avoid duplicates across strategies
        processed_ids: set[str] = set()

        strategies = self._build_search_strategies(search_query, media_types)
        for strategy_no, (strategy_query, sort_order) in enumerate(strategies, 1):
            self.logger.debug("Trying search strategy %d: %s", strategy_no, strategy_query)

            try:
                search_response = await self._search(
                    query=strategy_query,
                    rows=rows_per_strategy,
                    sort=sort_order,
                )

                docs = search_response.get("response", {}).get("docs", [])
                self.logger.debug(
                    "Strategy %d '%s' found %d raw results",
                    strategy_no,
                    strategy_query,
                    len(docs),
                )

                strategy_processed = 0
                strategy_skipped = 0

                for doc in docs:
                    try:
                        identifier = doc.get("identifier")
                        if not identifier or identifier in processed_ids:
                            strategy_skipped += 1
                            continue

                        # Mark before processing so a failing item is not retried
                        processed_ids.add(identifier)

                        await self._process_search_result(
                            doc, tracks, albums, artists, audiobooks, podcasts, media_types
                        )
                        strategy_processed += 1

                        # Check if we have enough results across all types
                        if self._has_sufficient_results(
                            tracks, albums, artists, audiobooks, podcasts, media_types, limit
                        ):
                            self.logger.debug(
                                "Sufficient results found after strategy %d, stopping search",
                                strategy_no,
                            )
                            break

                    except (InvalidDataError, KeyError) as err:
                        self.logger.debug("Skipping invalid search result: %s", err)
                        strategy_skipped += 1
                        continue

                self.logger.debug(
                    "Strategy %d '%s': processed %d new items, skipped %d items. "
                    "Running totals - tracks: %d, albums: %d, artists: %d, "
                    "audiobooks: %d, podcasts: %d",
                    strategy_no,
                    strategy_query,
                    strategy_processed,
                    strategy_skipped,
                    len(tracks),
                    len(albums),
                    len(artists),
                    len(audiobooks),
                    len(podcasts),
                )

                # If we have sufficient results, stop trying more strategies
                if self._has_sufficient_results(
                    tracks, albums, artists, audiobooks, podcasts, media_types, limit
                ):
                    break

            except Exception as err:
                # Best effort: one failing strategy must not discard earlier results
                self.logger.warning("Search strategy %d failed: %s", strategy_no, err)
                continue

        self.logger.debug(
            "Search for '%s' completed. Final results - tracks: %d, albums: %d, "
            "artists: %d, audiobooks: %d, podcasts: %d (processed %d unique items)",
            search_query,
            len(tracks),
            len(albums),
            len(artists),
            len(audiobooks),
            len(podcasts),
            len(processed_ids),
        )

        return SearchResults(
            tracks=tracks[:limit] if MediaType.TRACK in media_types else [],
            albums=albums[:limit] if MediaType.ALBUM in media_types else [],
            artists=artists[:limit] if MediaType.ARTIST in media_types else [],
            audiobooks=audiobooks[:limit] if MediaType.AUDIOBOOK in media_types else [],
            podcasts=podcasts[:limit] if MediaType.PODCAST in media_types else [],
        )

    def _has_sufficient_results(
        self,
        tracks: list[Track],
        albums: list[Album],
        artists: list[Artist],
        audiobooks: list[Audiobook],
        podcasts: list[Podcast],
        media_types: list[MediaType],
        limit: int,
    ) -> bool:
        """Check if we have sufficient results for all requested media types."""
        collected = (
            (MediaType.TRACK, tracks),
            (MediaType.ALBUM, albums),
            (MediaType.ARTIST, artists),
            (MediaType.AUDIOBOOK, audiobooks),
            (MediaType.PODCAST, podcasts),
        )
        # A media type that was not requested never blocks the early exit
        return all(
            media_type not in media_types or len(items) >= limit
            for media_type, items in collected
        )

    async def _process_search_result(
        self,
        doc: dict[str, Any],
        tracks: list[Track],
        albums: list[Album],
        artists: list[Artist],
        audiobooks: list[Audiobook],
        podcasts: list[Podcast],
        media_types: list[MediaType],
    ) -> None:
        """
        Process a single search result document from Internet Archive.

        Classifies the document and appends the resulting object to the matching
        output list (the lists are mutated in place). Audiobook and podcast content
        is handled first and is never processed as another media type.

        Raises:
            InvalidDataError: If the document has no identifier
        """
        identifier = doc.get("identifier")
        if not identifier:
            raise InvalidDataError("Missing identifier in search result")

        creator = clean_text(doc.get("creator"))

        # Be lenient - items without a title fall back to one derived from the identifier.
        # NOTE(review): the title is not consumed below; the doc_to_* parsers receive
        # the raw doc - confirm they apply an equivalent fallback.
        title = clean_text(doc.get("title"))
        if not title:
            title = create_title_from_identifier(identifier)

        mediatype = doc.get("mediatype", "")
        # "collection" may be a single string, a list, or present but empty/None
        collection = doc.get("collection") or []
        if isinstance(collection, str):
            collection = [collection]

        if is_audiobook_content(doc) and MediaType.AUDIOBOOK in media_types:
            audiobook = doc_to_audiobook(
                doc, self.domain, self.instance_id, self.client.get_item_url
            )
            if audiobook:
                audiobooks.append(audiobook)
            return  # Don't process as other media types

        if is_podcast_content(doc) and MediaType.PODCAST in media_types:
            podcast = doc_to_podcast(doc, self.domain, self.instance_id, self.client.get_item_url)
            if podcast:
                podcasts.append(podcast)
            return  # Don't process as other media types

        if mediatype == "etree" or "etree" in collection:
            # For etree items, usually each item is an album (concert)
            treat_as_album = True
        elif mediatype == "audio":
            # Use heuristics to determine album vs track without expensive API calls
            treat_as_album = is_likely_album(doc)
        else:
            return

        if treat_as_album:
            if MediaType.ALBUM in media_types:
                album = doc_to_album(doc, self.domain, self.instance_id, self.client.get_item_url)
                if album:
                    albums.append(album)
        elif MediaType.TRACK in media_types:
            track = doc_to_track(doc, self.domain, self.instance_id, self.client.get_item_url)
            if track:
                tracks.append(track)

        if MediaType.ARTIST in media_types:
            self._collect_artist(creator, artists)

    # ------------------------------------------------------------------
    # Item lookups
    # ------------------------------------------------------------------

    @use_cache(expiration=86400 * 60)  # Cache for 60 days - track metadata changes infrequently
    async def get_track(self, prov_track_id: str) -> Track:
        """
        Get full track details by id.

        The duration is taken from the first audio file of the item; failures while
        fetching or parsing it are logged and leave the duration unset.

        Raises:
            MediaNotFoundError: If the item metadata has no title
        """
        metadata = await self._get_metadata(prov_track_id)
        item_metadata = metadata.get("metadata", {})

        title = clean_text(item_metadata.get("title"))
        if not title:
            raise MediaNotFoundError(f"Track {prov_track_id} not found or invalid")

        track = Track(
            item_id=prov_track_id,
            provider=self.instance_id,
            name=title,
            provider_mappings={self._item_mapping(prov_track_id)},
        )
        track.artists = UniqueList(
            [self._artist_or_unknown(clean_text(item_metadata.get("creator")))]
        )

        # Add duration from first audio file
        try:
            audio_files = await self._get_audio_files(prov_track_id)
            if audio_files and audio_files[0].get("length"):
                duration = parse_duration(audio_files[0]["length"])
                if duration:
                    track.duration = duration
        except (TimeoutError, aiohttp.ClientError) as err:
            self.logger.debug("Network error getting duration for track %s: %s", prov_track_id, err)
        except (KeyError, ValueError, TypeError) as err:
            self.logger.debug("Could not parse duration for track %s: %s", prov_track_id, err)

        if description := clean_text(item_metadata.get("description")):
            track.metadata.description = description

        add_item_image(track, prov_track_id, self.instance_id)

        return track

    @use_cache(expiration=86400 * 60)  # Cache for 60 days - album catalogs change infrequently
    async def get_album(self, prov_album_id: str) -> Album:
        """
        Get full album details by id.

        Raises:
            MediaNotFoundError: If the item metadata has no title
        """
        metadata = await self._get_metadata(prov_album_id)
        item_metadata = metadata.get("metadata", {})

        title = clean_text(item_metadata.get("title"))
        if not title:
            raise MediaNotFoundError(f"Album {prov_album_id} not found or invalid")

        album = Album(
            item_id=prov_album_id,
            provider=self.instance_id,
            name=title,
            provider_mappings={self._item_mapping(prov_album_id)},
        )
        album.artists = UniqueList(
            [self._artist_or_unknown(clean_text(item_metadata.get("creator")))]
        )

        if year := extract_year(item_metadata.get("date")):
            album.year = year

        if description := clean_text(item_metadata.get("description")):
            album.metadata.description = description

        add_item_image(album, prov_album_id, self.instance_id)

        return album

    @use_cache(expiration=86400 * 60)  # Cache for 60 days - artist catalogs change infrequently
    async def get_artist(self, prov_artist_id: str) -> Artist:
        """
        Get full artist details by id.

        No remote call is made: artist ids are the creator names themselves.

        Args:
            prov_artist_id: Provider-specific artist identifier (artist name)

        Returns:
            Artist object
        """
        return Artist(
            item_id=prov_artist_id,
            provider=self.instance_id,
            name=prov_artist_id,
            provider_mappings={
                ProviderMapping(
                    item_id=prov_artist_id,
                    provider_domain=self.domain,
                    provider_instance=self.instance_id,
                )
            },
        )

    @use_cache(expiration=86400 * 30)  # Cache for 30 days - audiobook catalogs change infrequently
    async def get_audiobook(self, prov_audiobook_id: str) -> Audiobook:
        """
        Get full audiobook details by id.

        Duration and chapters are derived from the item's audio files; if that fails
        the audiobook is still returned, with duration 0 and no chapters.

        Raises:
            MediaNotFoundError: If the item metadata has no title
        """
        metadata = await self._get_metadata(prov_audiobook_id)
        item_metadata = metadata.get("metadata", {})

        title = clean_text(item_metadata.get("title"))
        if not title:
            raise MediaNotFoundError(f"Audiobook {prov_audiobook_id} not found or invalid")

        audiobook = Audiobook(
            item_id=prov_audiobook_id,
            provider=self.instance_id,
            name=title,
            provider_mappings={self._item_mapping(prov_audiobook_id)},
        )

        # Add author/narrator
        if creator := clean_text(item_metadata.get("creator")):
            audiobook.authors = UniqueList([creator])

        if description := clean_text(item_metadata.get("description")):
            audiobook.metadata.description = description

        add_item_image(audiobook, prov_audiobook_id, self.instance_id)

        try:
            total_duration, chapters = await self._calculate_audiobook_duration_and_chapters(
                prov_audiobook_id
            )
            audiobook.duration = total_duration
            # A single file is not worth exposing as a chapter list
            if len(chapters) > 1:
                audiobook.metadata.chapters = chapters
        except Exception as err:
            self.logger.warning(
                "Could not process audio files for audiobook %s: %s", prov_audiobook_id, err
            )
            audiobook.duration = 0
            audiobook.metadata.chapters = []

        return audiobook

    async def get_album_tracks(self, prov_album_id: str) -> list[Track]:
        """
        Get album tracks for given album id.

        Every audio file of the item becomes one track with the id
        ``<prov_album_id>#<filename>``. A file-level artist is used when present and
        different (case-insensitively) from the album artist; otherwise the album
        artist (or UNKNOWN_ARTIST) is used.
        """
        metadata = await self._get_metadata(prov_album_id)
        item_metadata = metadata.get("metadata", {})
        audio_files = await self._get_audio_files(prov_album_id)

        # Pre-create album artist so all tracks share one object
        album_artist = clean_text(item_metadata.get("creator"))
        album_artist_key = album_artist.lower() if album_artist else ""
        album_artist_obj = self._artist_or_unknown(album_artist)

        tracks: list[Track] = []
        for position, file_info in enumerate(audio_files, 1):
            filename, track_name = self._file_display_name(file_info)

            track = Track(
                item_id=f"{prov_album_id}#{filename}",
                provider=self.instance_id,
                name=track_name,
                # File metadata first, then a leading number in the name, then position
                track_number=self._extract_track_number(file_info, track_name, position),
                provider_mappings={self._file_mapping(prov_album_id, filename)},
            )

            raw_artist = file_info.get("artist") or file_info.get("creator")
            file_artist = clean_text(raw_artist) if raw_artist else ""
            # An artist that cleans down to nothing falls back to the album artist
            if file_artist and file_artist.lower() != album_artist_key:
                track.artists = UniqueList(
                    [create_artist(file_artist, self.domain, self.instance_id)]
                )
            else:
                track.artists = UniqueList([album_artist_obj])

            if duration_str := file_info.get("length"):
                if duration := parse_duration(duration_str):
                    track.duration = duration

            if genre := clean_text(file_info.get("genre")):
                track.metadata.genres = {genre}

            tracks.append(track)

        return tracks

    def _extract_track_number(
        self, file_info: dict[str, Any], track_name: str, fallback: int
    ) -> int:
        """
        Extract track number from file metadata or filename.

        Order of preference: the ``track`` entry of ``file_info`` (the part before
        any ``/``, e.g. ``"3/12"``), then leading digits of ``track_name`` when
        followed by more text, then ``fallback``.
        """
        if "track" in file_info:
            with contextlib.suppress(ValueError, AttributeError):
                return int(str(file_info["track"]).split("/")[0])

        match = self._TRACK_NUMBER_RE.search(track_name)
        return int(match.group(1)) if match else fallback

    @use_cache(expiration=86400 * 30)  # Cache for 30 days - artist catalogs change infrequently
    async def get_artist_albums(self, prov_artist_id: str) -> list[Album]:
        """
        Get albums for a specific artist.

        Pages through the creator search and keeps items that the metadata
        heuristics classify as albums, without per-item API calls.

        Args:
            prov_artist_id: Provider-specific artist identifier (artist name)

        Returns:
            List of Album objects by the artist
        """
        albums: list[Album] = []
        query = self._artist_query(prov_artist_id)
        page = 0
        page_size = 200  # IA's maximum

        while len(albums) < 1000:  # Reasonable upper limit
            search_response = await self._search(
                query=query,
                sort="downloads desc",
                rows=page_size,
                page=page,
            )

            docs = search_response.get("response", {}).get("docs", [])
            if not docs:
                break

            for doc in docs:
                try:
                    if is_likely_album(doc):
                        album = doc_to_album(
                            doc, self.domain, self.instance_id, self.client.get_item_url
                        )
                        if album:
                            albums.append(album)
                except (KeyError, ValueError, TypeError) as err:
                    self.logger.debug(
                        "Skipping invalid album for artist %s: %s", prov_artist_id, err
                    )
                    continue
                except (TimeoutError, aiohttp.ClientError) as err:
                    self.logger.debug(
                        "Network error processing album for artist %s: %s", prov_artist_id, err
                    )
                    continue
                except Exception as err:
                    self.logger.exception(
                        "Unexpected error processing album for artist %s: %s", prov_artist_id, err
                    )
                    continue
            page += 1
        return albums

    @use_cache(expiration=86400 * 7)  # Cache for 1 week
    async def get_artist_toptracks(self, prov_artist_id: str) -> list[Track]:
        """
        Get top tracks for a specific artist.

        Uses the same search as get_artist_albums but filters for single tracks.

        Args:
            prov_artist_id: Provider-specific artist identifier (artist name)

        Returns:
            List of Track objects representing the artist's top tracks
        """
        tracks: list[Track] = []
        search_response = await self._search(
            query=self._artist_query(prov_artist_id),
            rows=self._TOP_TRACKS_LIMIT,  # Limit for "top" tracks
            sort="downloads desc",
        )

        docs = search_response.get("response", {}).get("docs", [])

        for doc in docs:
            try:
                # Only include items that are NOT classified as albums
                if not is_likely_album(doc):
                    track = doc_to_track(
                        doc, self.domain, self.instance_id, self.client.get_item_url
                    )
                    if track:
                        tracks.append(track)
            except (KeyError, ValueError, TypeError) as err:
                self.logger.debug("Skipping invalid track for artist %s: %s", prov_artist_id, err)
                continue
            except (TimeoutError, aiohttp.ClientError) as err:
                self.logger.debug(
                    "Network error processing track for artist %s: %s", prov_artist_id, err
                )
                continue
            except Exception as err:
                self.logger.exception(
                    "Unexpected error processing track for artist %s: %s", prov_artist_id, err
                )
                continue

            if len(tracks) >= self._TOP_TRACKS_LIMIT:
                break

        return tracks

    # ------------------------------------------------------------------
    # Streaming
    # ------------------------------------------------------------------

    async def get_stream_details(self, item_id: str, media_type: MediaType) -> StreamDetails:
        """
        Get streamdetails for a track or audiobook.

        Delegates to the streaming handler for proper multi-file support.

        Args:
            item_id: Provider-specific item identifier
            media_type: The type of media being requested

        Returns:
            StreamDetails object configured for the specific item type

        Raises:
            MediaNotFoundError: If no audio files are found for the item
        """
        return await self.streaming.get_stream_details(item_id, media_type)

    async def _calculate_audiobook_duration_and_chapters(
        self, item_id: str
    ) -> tuple[int, list[MediaItemChapter]]:
        """
        Calculate duration and chapters for audiobooks.

        Each audio file becomes one chapter, laid out back to back in file order.
        Chapters whose length is unknown get no end position.

        Returns:
            Tuple of (total duration, list of chapters)
        """
        audio_files = await self._get_audio_files(item_id)
        total_duration = 0
        chapters: list[MediaItemChapter] = []
        current_position = 0.0

        for index, file_info in enumerate(audio_files, 1):
            chapter_duration = parse_duration(file_info.get("length", "0")) or 0
            total_duration += chapter_duration

            # Fall back through title -> filename -> generic label, skipping empty values
            chapter_name = file_info.get("title") or file_info.get("name") or f"Chapter {index}"
            chapters.append(
                MediaItemChapter(
                    position=index,
                    name=clean_text(chapter_name),
                    start=current_position,
                    end=current_position + chapter_duration if chapter_duration > 0 else None,
                )
            )
            current_position += chapter_duration

        return total_duration, chapters

    async def get_audio_stream(
        self, streamdetails: StreamDetails, seek_position: int = 0
    ) -> AsyncGenerator[bytes, None]:
        """
        Get audio stream from Internet Archive.

        Audiobooks with chapter data are streamed chapter after chapter, starting at
        the chapter containing ``seek_position`` (that chapter is streamed from its
        beginning). A failing chapter is logged and skipped. Anything else is
        streamed as a single file: the file named in an ``<identifier>#<filename>``
        id, or the first audio file of the item.

        Raises:
            MediaNotFoundError: If no chapter yields data, or the item has no audio files
        """
        # Use sock_read=None to allow long audiobook chapters to stream fully
        timeout = aiohttp.ClientTimeout(sock_read=None, total=None)

        if streamdetails.media_type == MediaType.AUDIOBOOK and isinstance(streamdetails.data, dict):
            chapter_urls = streamdetails.data.get("chapters", [])
            chapters_data = streamdetails.data.get("chapters_data", [])
            start_chapter = self._chapter_index_for_position(chapters_data, seek_position)

            chapters_yielded = False
            for chapter_no, chapter_url in enumerate(
                chapter_urls[start_chapter:], start_chapter + 1
            ):
                try:
                    async with self.mass.http_session.get(chapter_url, timeout=timeout) as response:
                        response.raise_for_status()
                        async for chunk in response.content.iter_chunked(8192):
                            chapters_yielded = True
                            yield chunk
                except Exception as err:
                    self.logger.error("Chapter %d streaming failed: %s", chapter_no, err)
                    continue

            # If no chapters succeeded, raise an error instead of silent failure
            if not chapters_yielded:
                raise MediaNotFoundError(
                    f"Failed to stream any chapters for audiobook {streamdetails.item_id}"
                )
            return

        # Single file: ids of files inside an item already name the file to stream
        identifier, _, filename = streamdetails.item_id.partition("#")
        if not filename:
            audio_files = await self._get_audio_files(identifier)
            if not audio_files:
                raise MediaNotFoundError(f"No audio files found for {streamdetails.item_id}")
            filename = audio_files[0]["name"]

        download_url = self.client.get_download_url(identifier, filename)
        async with self.mass.http_session.get(download_url, timeout=timeout) as response:
            response.raise_for_status()
            async for chunk in response.content.iter_chunked(8192):
                yield chunk

    # ------------------------------------------------------------------
    # Podcasts
    # ------------------------------------------------------------------

    @use_cache(expiration=86400 * 7)  # Cache for 1 week
    async def get_podcast(self, prov_podcast_id: str) -> Podcast:
        """
        Get full podcast details by id.

        The episode count is the number of audio files of the item; it is left as
        None when the file listing cannot be retrieved.

        Raises:
            MediaNotFoundError: If the item metadata has no title
        """
        metadata = await self._get_metadata(prov_podcast_id)
        item_metadata = metadata.get("metadata", {})

        title = clean_text(item_metadata.get("title"))
        if not title:
            raise MediaNotFoundError(f"Podcast {prov_podcast_id} not found or invalid")

        podcast = Podcast(
            item_id=prov_podcast_id,
            provider=self.instance_id,
            name=title,
            provider_mappings={self._item_mapping(prov_podcast_id)},
        )

        # Add publisher/creator
        if creator := clean_text(item_metadata.get("creator")):
            podcast.publisher = creator

        if description := clean_text(item_metadata.get("description")):
            podcast.metadata.description = description

        add_item_image(podcast, prov_podcast_id, self.instance_id)

        try:
            audio_files = await self._get_audio_files(prov_podcast_id)
            podcast.total_episodes = len(audio_files)
        except Exception as err:
            self.logger.warning(
                "Could not get episode count for podcast %s: %s", prov_podcast_id, err
            )
            podcast.total_episodes = None

        return podcast

    async def get_podcast_episodes(
        self, prov_podcast_id: str
    ) -> AsyncGenerator[PodcastEpisode, None]:
        """
        Get podcast episodes for given podcast id.

        Yields one episode per audio file, with the id ``<prov_podcast_id>#<filename>``.
        """
        metadata = await self._get_metadata(prov_podcast_id)
        item_metadata = metadata.get("metadata", {})
        audio_files = await self._get_audio_files(prov_podcast_id)

        # Podcast reference attached to every episode; the id stands in for a
        # missing or empty title
        podcast = Podcast(
            item_id=prov_podcast_id,
            provider=self.instance_id,
            name=clean_text(item_metadata.get("title")) or prov_podcast_id,
            provider_mappings={self._item_mapping(prov_podcast_id)},
        )

        for position, file_info in enumerate(audio_files, 1):
            filename, episode_name = self._file_display_name(file_info)

            episode = PodcastEpisode(
                item_id=f"{prov_podcast_id}#{filename}",
                provider=self.instance_id,
                name=episode_name,
                # File metadata first, then a leading number in the name, then position
                position=self._extract_track_number(file_info, episode_name, position),
                podcast=podcast,
                provider_mappings={self._file_mapping(prov_podcast_id, filename)},
            )

            if duration_str := file_info.get("length"):
                if duration := parse_duration(duration_str):
                    episode.duration = duration

            if description := clean_text(file_info.get("description")):
                episode.metadata.description = description

            yield episode

    async def get_podcast_episode(self, prov_episode_id: str) -> PodcastEpisode:
        """
        Get single podcast episode by id.

        Raises:
            MediaNotFoundError: If the id has no ``#`` separator or no episode matches
        """
        if "#" not in prov_episode_id:
            raise MediaNotFoundError(f"Invalid episode ID format: {prov_episode_id}")

        podcast_id, _ = prov_episode_id.split("#", 1)

        async for episode in self.get_podcast_episodes(podcast_id):
            if episode.item_id == prov_episode_id:
                return episode

        raise MediaNotFoundError(f"Episode {prov_episode_id} not found")
956