music-assistant-server

36.8 KB•PY

provider.py

36.8 KB • 956 lines • python

1"""Internet Archive music provider implementation."""
2
3from __future__ import annotations
4
5import contextlib
6import re
7from collections.abc import AsyncGenerator
8from typing import TYPE_CHECKING, Any
9
10import aiohttp
11from music_assistant_models.enums import MediaType, ProviderFeature
12from music_assistant_models.errors import InvalidDataError, MediaNotFoundError
13from music_assistant_models.media_items import (
14    Album,
15    Artist,
16    Audiobook,
17    MediaItemChapter,
18    Podcast,
19    PodcastEpisode,
20    ProviderMapping,
21    SearchResults,
22    Track,
23)
24from music_assistant_models.unique_list import UniqueList
25
26from music_assistant.constants import UNKNOWN_ARTIST
27from music_assistant.controllers.cache import use_cache
28from music_assistant.helpers.throttle_retry import ThrottlerManager, throttle_with_retries
29from music_assistant.models.music_provider import MusicProvider
30
31from .helpers import InternetArchiveClient, clean_text, extract_year, parse_duration
32from .parsers import (
33    add_item_image,
34    artist_exists,
35    create_artist,
36    create_provider_mapping,
37    create_title_from_identifier,
38    doc_to_album,
39    doc_to_audiobook,
40    doc_to_podcast,
41    doc_to_track,
42    is_audiobook_content,
43    is_likely_album,
44    is_podcast_content,
45)
46from .streaming import InternetArchiveStreaming
47
48if TYPE_CHECKING:
49    from music_assistant_models.config_entries import ProviderConfig
50    from music_assistant_models.provider import ProviderManifest
51    from music_assistant_models.streamdetails import StreamDetails
52
53    from music_assistant import MusicAssistant
54
55
56class InternetArchiveProvider(MusicProvider):
57    """Implementation of Internet Archive music provider."""
58
def __init__(
    self,
    mass: MusicAssistant,
    manifest: ProviderManifest,
    config: ProviderConfig,
    supported_features: set[ProviderFeature],
) -> None:
    """
    Set up the provider with its throttler, API client and streaming handler.

    Args:
        mass: Music Assistant instance, passed to the base class and the API client
        manifest: Provider manifest, passed to the base class
        config: Provider configuration, passed to the base class
        supported_features: Supported feature set, passed to the base class
    """
    super().__init__(mass, manifest, config, supported_features)

    # NOTE(review): presumably 10 requests per 60 seconds with up to 5 retries and a
    # 5 second initial backoff - confirm against ThrottlerManager.
    throttle_settings = {
        "rate_limit": 10,
        "period": 60,
        "retry_attempts": 5,
        "initial_backoff": 5,
    }
    self.throttler = ThrottlerManager(**throttle_settings)

    # The client is created before the streaming handler, which receives the provider.
    self.client = InternetArchiveClient(mass)
    self.streaming = InternetArchiveStreaming(self)
73
@property
def is_streaming_provider(self) -> bool:
    """Report this provider as a streaming provider (always True)."""
    return True
78
@throttle_with_retries
async def _get_json(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
    """
    Fetch a URL through the client's JSON getter, wrapped by the throttle decorator.

    Args:
        url: Address to request
        params: Optional query parameters passed through to the client

    Returns:
        Whatever the client's `_get_json` returns for the request
    """
    payload = await self.client._get_json(url, params)
    return payload
83
@throttle_with_retries
async def _search(self, **kwargs: Any) -> dict[str, Any]:
    """
    Run a client search, wrapped by the throttle decorator.

    Args:
        **kwargs: Keyword arguments passed unchanged to `self.client.search`

    Returns:
        The search response returned by the client
    """
    result = await self.client.search(**kwargs)
    return result
88
@throttle_with_retries
async def _get_metadata(self, identifier: str) -> dict[str, Any]:
    """
    Fetch item metadata from the client, wrapped by the throttle decorator.

    Args:
        identifier: Internet Archive item identifier

    Returns:
        The metadata response returned by the client
    """
    item = await self.client.get_metadata(identifier)
    return item
93
# The cache decorator is outermost so that a cache hit returns without going through
# the throttle wrapper; only real API requests should count against the rate limit.
@use_cache(expiration=86400 * 30)  # 30 days - file listings are static
@throttle_with_retries
async def _get_audio_files(self, identifier: str) -> list[dict[str, Any]]:
    """
    Fetch the audio file listing of an item, cached and throttled.

    Args:
        identifier: Internet Archive item identifier

    Returns:
        The list of audio file entries returned by the client
    """
    return await self.client.get_audio_files(identifier)
99
@use_cache(86400 * 7)  # 7 days
async def search(
    self,
    search_query: str,
    media_types: list[MediaType],
    limit: int = 5,
) -> SearchResults:
    """
    Search the Internet Archive for the requested media types.

    One query is issued per applicable strategy: creator, title and subject queries
    when tracks, albums or artists are requested, plus collection-restricted queries
    for audiobooks and podcasts. Results accumulate across strategies, items are
    de-duplicated by identifier, and the search stops once every requested media
    type has reached `limit` results. A failing strategy is logged and skipped.

    Args:
        search_query: The search term to look for
        media_types: List of media types to search for
        limit: Maximum number of results to return per media type

    Returns:
        SearchResults holding at most `limit` items per requested media type
    """
    if not search_query.strip():
        return SearchResults()

    # Ask for fewer rows per query when several media types are requested at once
    row_cap = 16 if len(media_types) > 1 else 100
    rows = min(limit * 2, row_cap)

    # One result list per media type; the tuple order matches the positional
    # parameters of _process_search_result and _has_sufficient_results.
    tracks: list[Track] = []
    albums: list[Album] = []
    artists: list[Artist] = []
    audiobooks: list[Audiobook] = []
    podcasts: list[Podcast] = []
    buckets = (tracks, albums, artists, audiobooks, podcasts)

    # Identifiers already handled, shared by all strategies
    seen_ids: set[str] = set()

    wants_music = (
        MediaType.TRACK in media_types
        or MediaType.ALBUM in media_types
        or MediaType.ARTIST in media_types
    )

    # Every strategy sorts by download count, so only the query text varies
    sort_order = "downloads desc"
    queries: list[str] = []
    if wants_music:
        queries += [
            f"creator:({search_query}) AND mediatype:audio",
            f"title:({search_query}) AND mediatype:audio",
            f"subject:({search_query}) AND mediatype:audio",
        ]
    if MediaType.AUDIOBOOK in media_types:
        queries.append(
            f"{search_query} AND collection:(librivoxaudio OR audio_bookspoetry)"
            " AND mediatype:audio"
        )
    if MediaType.PODCAST in media_types:
        queries.append(f"{search_query} AND collection:podcasts AND mediatype:audio")

    for number, query in enumerate(queries, start=1):
        self.logger.debug("Trying search strategy %d: %s", number, query)

        try:
            response = await self._search(query=query, rows=rows, sort=sort_order)
            docs = response.get("response", {}).get("docs", [])
            self.logger.debug(
                "Strategy %d '%s' found %d raw results",
                number,
                query,
                len(docs),
            )

            handled = 0
            skipped = 0

            for doc in docs:
                try:
                    identifier = doc.get("identifier")
                    if not identifier or identifier in seen_ids:
                        skipped += 1
                        continue

                    # Mark as seen before processing so a failing item is not retried
                    seen_ids.add(identifier)

                    await self._process_search_result(doc, *buckets, media_types)
                    handled += 1

                    if self._has_sufficient_results(*buckets, media_types, limit):
                        self.logger.debug(
                            "Sufficient results found after strategy %d, stopping search",
                            number,
                        )
                        break

                except (InvalidDataError, KeyError) as err:
                    self.logger.debug("Skipping invalid search result: %s", err)
                    skipped += 1

            self.logger.debug(
                "Strategy %d '%s': processed %d new items, skipped %d items. "
                "Running totals - tracks: %d, albums: %d, artists: %d, "
                "audiobooks: %d, podcasts: %d",
                number,
                query,
                handled,
                skipped,
                len(tracks),
                len(albums),
                len(artists),
                len(audiobooks),
                len(podcasts),
            )

            # No need to run the remaining strategies once every type is filled
            if self._has_sufficient_results(*buckets, media_types, limit):
                break

        except Exception as err:
            # Best effort: one failing strategy must not abort the whole search
            self.logger.warning("Search strategy %d failed: %s", number, err)

    self.logger.debug(
        "Search for '%s' completed. Final results - tracks: %d, albums: %d, "
        "artists: %d, audiobooks: %d, podcasts: %d (processed %d unique items)",
        search_query,
        len(tracks),
        len(albums),
        len(artists),
        len(audiobooks),
        len(podcasts),
        len(seen_ids),
    )

    def _requested(kind: MediaType, items: list[Any]) -> list[Any]:
        """Return the first `limit` items when `kind` was requested, else nothing."""
        return items[:limit] if kind in media_types else []

    return SearchResults(
        tracks=_requested(MediaType.TRACK, tracks),
        albums=_requested(MediaType.ALBUM, albums),
        artists=_requested(MediaType.ARTIST, artists),
        audiobooks=_requested(MediaType.AUDIOBOOK, audiobooks),
        podcasts=_requested(MediaType.PODCAST, podcasts),
    )
258
def _has_sufficient_results(
    self,
    tracks: list[Track],
    albums: list[Album],
    artists: list[Artist],
    audiobooks: list[Audiobook],
    podcasts: list[Podcast],
    media_types: list[MediaType],
    limit: int,
) -> bool:
    """
    Tell whether every requested media type has collected at least `limit` items.

    Media types that were not requested are ignored, so the result is True when
    `media_types` contains none of the five supported types.
    """
    collected = (
        (MediaType.TRACK, tracks),
        (MediaType.ALBUM, albums),
        (MediaType.ARTIST, artists),
        (MediaType.AUDIOBOOK, audiobooks),
        (MediaType.PODCAST, podcasts),
    )
    return all(len(items) >= limit for kind, items in collected if kind in media_types)
277
async def _process_search_result(
    self,
    doc: dict[str, Any],
    tracks: list[Track],
    albums: list[Album],
    artists: list[Artist],
    audiobooks: list[Audiobook],
    podcasts: list[Podcast],
    media_types: list[MediaType],
) -> None:
    """
    Classify one search result document and append it to the matching result list.

    Classification order: audiobook, podcast, etree item (treated as an album),
    then generic audio (album or track, decided by `is_likely_album`). Audiobook and
    podcast matches return early. For etree and audio items the creator is also
    added to `artists` when artists were requested and it is not already present.
    Documents of any other mediatype are ignored.

    Args:
        doc: Raw search result document
        tracks: Track results, appended in place
        albums: Album results, appended in place
        artists: Artist results, appended in place
        audiobooks: Audiobook results, appended in place
        podcasts: Podcast results, appended in place
        media_types: Media types requested by the caller

    Raises:
        InvalidDataError: If the document has no identifier
    """
    identifier = doc.get("identifier")
    if not identifier:
        raise InvalidDataError("Missing identifier in search result")

    title = clean_text(doc.get("title"))
    creator = clean_text(doc.get("creator"))

    # Be lenient - items without a title fall back to a title derived from the identifier.
    # NOTE(review): `title` is not passed on; the doc_to_* parsers receive the raw doc.
    if not title:
        title = create_title_from_identifier(identifier)

    # Normalize collection to a list; a missing or null value becomes an empty list
    mediatype = doc.get("mediatype", "")
    collection = doc.get("collection") or []
    if isinstance(collection, str):
        collection = [collection]

    # Audiobooks and podcasts are exclusive: never also reported as music
    if is_audiobook_content(doc) and MediaType.AUDIOBOOK in media_types:
        audiobook = doc_to_audiobook(
            doc, self.domain, self.instance_id, self.client.get_item_url
        )
        if audiobook:
            audiobooks.append(audiobook)
        return

    if is_podcast_content(doc) and MediaType.PODCAST in media_types:
        podcast = doc_to_podcast(doc, self.domain, self.instance_id, self.client.get_item_url)
        if podcast:
            podcasts.append(podcast)
        return

    # For etree items, usually each item is an album (concert)
    is_etree = mediatype == "etree" or "etree" in collection
    if not is_etree and mediatype != "audio":
        return

    # Heuristics decide album vs track for plain audio without extra API calls
    if is_etree or is_likely_album(doc):
        if MediaType.ALBUM in media_types:
            album = doc_to_album(doc, self.domain, self.instance_id, self.client.get_item_url)
            if album:
                albums.append(album)
    elif MediaType.TRACK in media_types:
        track = doc_to_track(doc, self.domain, self.instance_id, self.client.get_item_url)
        if track:
            tracks.append(track)

    # Both etree and audio items contribute their creator as an artist result
    if MediaType.ARTIST in media_types and creator:
        artist = create_artist(creator, self.domain, self.instance_id)
        if artist and not artist_exists(artist, artists):
            artists.append(artist)
361
@use_cache(expiration=86400 * 60)  # Cache for 60 days
async def get_track(self, prov_track_id: str) -> Track:
    """
    Build the full Track for an Internet Archive item.

    Title, creator and description come from the item metadata; the duration is
    taken from the first audio file and is left unset when it cannot be determined.

    Args:
        prov_track_id: Provider-specific track identifier

    Returns:
        Track object

    Raises:
        MediaNotFoundError: If the item metadata has no usable title
    """
    response = await self._get_metadata(prov_track_id)
    fields = response.get("metadata", {})

    title = clean_text(fields.get("title"))
    creator = clean_text(fields.get("creator"))
    if not title:
        raise MediaNotFoundError(f"Track {prov_track_id} not found or invalid")

    mapping = create_provider_mapping(
        prov_track_id, self.domain, self.instance_id, self.client.get_item_url
    )
    track = Track(
        item_id=prov_track_id,
        provider=self.instance_id,
        name=title,
        provider_mappings={mapping},
    )

    # Items without a creator are attributed to the unknown-artist placeholder
    artist_name = creator or UNKNOWN_ARTIST
    track.artists = UniqueList([create_artist(artist_name, self.domain, self.instance_id)])

    # Duration is optional: lookup or parse problems are logged and ignored
    try:
        audio_files = await self._get_audio_files(prov_track_id)
        if audio_files and (raw_length := audio_files[0].get("length")):
            if seconds := parse_duration(raw_length):
                track.duration = seconds
    except (TimeoutError, aiohttp.ClientError) as err:
        self.logger.debug("Network error getting duration for track %s: %s", prov_track_id, err)
    except (KeyError, ValueError, TypeError) as err:
        self.logger.debug("Could not parse duration for track %s: %s", prov_track_id, err)

    description = clean_text(fields.get("description"))
    if description:
        track.metadata.description = description

    # Add thumbnail
    add_item_image(track, prov_track_id, self.instance_id)

    return track
413
@use_cache(expiration=86400 * 60)  # Cache for 60 days - album catalogs change infrequently
async def get_album(self, prov_album_id: str) -> Album:
    """
    Build the full Album for an Internet Archive item.

    Args:
        prov_album_id: Provider-specific album identifier

    Returns:
        Album object with artist, year, description and image filled in when available

    Raises:
        MediaNotFoundError: If the item metadata has no usable title
    """
    response = await self._get_metadata(prov_album_id)
    fields = response.get("metadata", {})

    title = clean_text(fields.get("title"))
    creator = clean_text(fields.get("creator"))
    if not title:
        raise MediaNotFoundError(f"Album {prov_album_id} not found or invalid")

    mapping = create_provider_mapping(
        prov_album_id, self.domain, self.instance_id, self.client.get_item_url
    )
    album = Album(
        item_id=prov_album_id,
        provider=self.instance_id,
        name=title,
        provider_mappings={mapping},
    )

    # Items without a creator are attributed to the unknown-artist placeholder
    artist_name = creator or UNKNOWN_ARTIST
    album.artists = UniqueList([create_artist(artist_name, self.domain, self.instance_id)])

    release_year = extract_year(fields.get("date"))
    if release_year:
        album.year = release_year

    description = clean_text(fields.get("description"))
    if description:
        album.metadata.description = description

    # Add thumbnail
    add_item_image(album, prov_album_id, self.instance_id)

    return album
456
@use_cache(expiration=86400 * 60)  # Cache for 60 days - artist catalogs change infrequently
async def get_artist(self, prov_artist_id: str) -> Artist:
    """
    Build an Artist from its identifier alone, without any API request.

    The identifier is the creator name, so it is used as both item id and name.

    Args:
        prov_artist_id: Provider-specific artist identifier (artist name)

    Returns:
        Artist object
    """
    mapping = ProviderMapping(
        item_id=prov_artist_id,
        provider_domain=self.domain,
        provider_instance=self.instance_id,
    )
    return Artist(
        item_id=prov_artist_id,
        provider=self.instance_id,
        name=prov_artist_id,
        provider_mappings={mapping},
    )
481
@use_cache(expiration=86400 * 30)  # Cache for 30 days - audiobook catalogs change infrequently
async def get_audiobook(self, prov_audiobook_id: str) -> Audiobook:
    """
    Get full audiobook details by id.

    Title, creator and description come from the item metadata. Duration and
    chapters are derived from the item's audio files; when that step fails the
    audiobook is still returned, with a duration of 0 and no chapters.

    Args:
        prov_audiobook_id: Provider-specific audiobook identifier

    Returns:
        Audiobook object

    Raises:
        MediaNotFoundError: If the item metadata has no usable title
    """
    metadata = await self._get_metadata(prov_audiobook_id)
    item_metadata = metadata.get("metadata", {})

    title = clean_text(item_metadata.get("title"))
    creator = clean_text(item_metadata.get("creator"))

    if not title:
        raise MediaNotFoundError(f"Audiobook {prov_audiobook_id} not found or invalid")

    audiobook = Audiobook(
        item_id=prov_audiobook_id,
        provider=self.instance_id,
        name=title,
        provider_mappings={
            create_provider_mapping(
                prov_audiobook_id, self.domain, self.instance_id, self.client.get_item_url
            )
        },
    )

    # The creator is the only author information available here
    if creator:
        audiobook.authors = UniqueList([creator])

    if description := clean_text(item_metadata.get("description")):
        audiobook.metadata.description = description

    # Add thumbnail
    add_item_image(audiobook, prov_audiobook_id, self.instance_id)

    # Duration and chapters are best effort: any failure degrades to empty values
    try:
        total_duration, chapters = await self._calculate_audiobook_duration_and_chapters(
            prov_audiobook_id
        )
        audiobook.duration = total_duration
        # A single file is not worth exposing as a chapter list
        if len(chapters) > 1:
            audiobook.metadata.chapters = chapters
    except Exception as err:
        # Lazy %-style arguments, matching the other log calls in this provider
        self.logger.warning(
            "Could not process audio files for audiobook %s: %s", prov_audiobook_id, err
        )
        audiobook.duration = 0
        audiobook.metadata.chapters = []

    return audiobook
534
async def get_album_tracks(self, prov_album_id: str) -> list[Track]:
    """
    Get album tracks for given album id.

    Every audio file of the item becomes one Track whose item id is
    "<album id>#<file name>". The track artist is the file's own artist when it has
    one that differs (case-insensitively) from the album creator; otherwise the
    album artist, or the unknown-artist placeholder, is used.

    Args:
        prov_album_id: Provider-specific album identifier

    Returns:
        List of Track objects in the order of the item's audio files
    """
    metadata = await self._get_metadata(prov_album_id)
    item_metadata = metadata.get("metadata", {})
    audio_files = await self._get_audio_files(prov_album_id)

    # Create the album artist once and share it between the tracks
    album_artist = clean_text(item_metadata.get("creator"))
    album_artist_normalized = album_artist.lower() if album_artist else ""
    album_artist_obj = create_artist(
        album_artist or UNKNOWN_ARTIST, self.domain, self.instance_id
    )

    tracks: list[Track] = []
    for i, file_info in enumerate(audio_files, 1):
        filename = file_info.get("name", "")

        # Use file's title if available, otherwise the file name without extension
        track_name = file_info.get("title", filename)
        if not track_name or track_name == filename:
            track_name = filename.rsplit(".", 1)[0]

        # Try to extract track number from file metadata first, then filename
        track_number = self._extract_track_number(file_info, track_name, i)

        track_id = f"{prov_album_id}#{filename}"
        track = Track(
            item_id=track_id,
            provider=self.instance_id,
            name=track_name,
            track_number=track_number,
            provider_mappings={
                ProviderMapping(
                    item_id=track_id,
                    provider_domain=self.domain,
                    provider_instance=self.instance_id,
                    url=self.client.get_download_url(prov_album_id, filename),
                    available=True,
                )
            },
        )

        # A file artist that cleans down to nothing counts as missing, so it
        # falls back to the album artist instead of failing on .lower() or
        # producing an artist with a blank name.
        raw_artist = file_info.get("artist") or file_info.get("creator")
        file_artist = clean_text(raw_artist) if raw_artist else None
        if file_artist and file_artist.lower() != album_artist_normalized:
            track.artists = UniqueList(
                [create_artist(file_artist, self.domain, self.instance_id)]
            )
        else:
            track.artists = UniqueList([album_artist_obj])

        # Add duration if available
        if duration_str := file_info.get("length"):
            if duration := parse_duration(duration_str):
                track.duration = duration

        # Add genre if available; skip values that are empty after cleaning
        raw_genre = file_info.get("genre")
        if raw_genre and (genre := clean_text(raw_genre)):
            track.metadata.genres = {genre}

        tracks.append(track)

    return tracks
606
607    def _extract_track_number(
608        self, file_info: dict[str, Any], track_name: str, fallback: int
609    ) -> int:
610        """Extract track number from file metadata or filename."""
611        track_number = None
612
613        if "track" in file_info:
614            with contextlib.suppress(ValueError, AttributeError):
615                track_number = int(str(file_info["track"]).split("/")[0])
616
617        if track_number is None:
618            # Fallback to filename parsing
619            track_num_match = re.search(r"^(\d+)[\s\-_.]*(.+)", track_name)
620            track_number = int(track_num_match.group(1)) if track_num_match else fallback
621
622        return track_number
623
@use_cache(expiration=86400 * 30)  # Cache for 30 days - artist catalogs change infrequently
async def get_artist_albums(self, prov_artist_id: str) -> list[Album]:
    """
    Get albums for a specific artist.

    Pages through the creator's items sorted by downloads and keeps the ones that
    `is_likely_album` classifies as albums, so no per-item API call is needed.
    Paging stops at the first empty page or once at least 1000 albums are
    collected (the last page is not truncated).

    Args:
        prov_artist_id: Provider-specific artist identifier (artist name)

    Returns:
        List of Album objects by the artist
    """
    # Escape backslashes and double quotes so that a name containing them stays
    # inside the quoted creator phrase instead of ending it early.
    creator_phrase = prov_artist_id.replace("\\", "\\\\").replace('"', '\\"')
    # Built once, with implicit string concatenation rather than a backslash
    # continuation, so no source indentation ends up inside the query.
    query = (
        f'creator:"{creator_phrase}" AND '
        '(format:"VBR MP3" OR format:"FLAC" OR format:"Ogg Vorbis")'
    )

    albums: list[Album] = []
    # NOTE(review): paging starts at 0 - confirm this is the first page for client.search
    page = 0
    page_size = 200  # IA's maximum

    while len(albums) < 1000:  # Reasonable upper limit
        search_response = await self._search(
            query=query,
            sort="downloads desc",
            rows=page_size,
            page=page,
        )

        docs = search_response.get("response", {}).get("docs", [])
        if not docs:
            break

        for doc in docs:
            try:
                # Use metadata heuristics instead of expensive API calls
                # to determine if item is an album
                if is_likely_album(doc):
                    album = doc_to_album(
                        doc, self.domain, self.instance_id, self.client.get_item_url
                    )
                    if album:
                        albums.append(album)
            except (KeyError, ValueError, TypeError) as err:
                self.logger.debug(
                    "Skipping invalid album for artist %s: %s", prov_artist_id, err
                )
            except (TimeoutError, aiohttp.ClientError) as err:
                self.logger.debug(
                    "Network error processing album for artist %s: %s", prov_artist_id, err
                )
            except Exception as err:
                # One malformed document must not abort the whole listing
                self.logger.exception(
                    "Unexpected error processing album for artist %s: %s", prov_artist_id, err
                )
        page += 1
    return albums
682
@use_cache(expiration=86400 * 7)  # Cache for 1 week
async def get_artist_toptracks(self, prov_artist_id: str) -> list[Track]:
    """
    Get top tracks for a specific artist.

    Runs a single creator search sorted by downloads and keeps the items that
    `is_likely_album` does NOT classify as albums.

    Args:
        prov_artist_id: Provider-specific artist identifier (artist name)

    Returns:
        List of at most 25 Track objects representing the artist's top tracks
    """
    max_tracks = 25  # Limit for "top" tracks

    # Escape backslashes and double quotes so that a name containing them stays
    # inside the quoted creator phrase instead of ending it early.
    creator_phrase = prov_artist_id.replace("\\", "\\\\").replace('"', '\\"')
    search_response = await self._search(
        query=(
            f'creator:"{creator_phrase}" AND '
            '(format:"VBR MP3" OR format:"FLAC" OR format:"Ogg Vorbis")'
        ),
        rows=max_tracks,
        sort="downloads desc",
    )
    docs = search_response.get("response", {}).get("docs", [])

    tracks: list[Track] = []
    for doc in docs:
        try:
            # Only include items that are NOT classified as albums
            if not is_likely_album(doc):
                track = doc_to_track(
                    doc, self.domain, self.instance_id, self.client.get_item_url
                )
                if track:
                    tracks.append(track)
        except (KeyError, ValueError, TypeError) as err:
            self.logger.debug("Skipping invalid track for artist %s: %s", prov_artist_id, err)
            continue
        except (TimeoutError, aiohttp.ClientError) as err:
            self.logger.debug(
                "Network error processing track for artist %s: %s", prov_artist_id, err
            )
            continue
        except Exception as err:
            # One malformed document must not abort the whole listing
            self.logger.exception(
                "Unexpected error processing track for artist %s: %s", prov_artist_id, err
            )
            continue

        # Safety cap in case more rows than requested come back
        if len(tracks) >= max_tracks:
            break

    return tracks
736
async def get_stream_details(self, item_id: str, media_type: MediaType) -> StreamDetails:
    """
    Return the stream details for an item by delegating to the streaming handler.

    Args:
        item_id: Provider-specific item identifier
        media_type: The type of media being requested

    Returns:
        The StreamDetails produced by `self.streaming.get_stream_details`

    Raises:
        MediaNotFoundError: Presumably raised by the streaming handler when the
            item has no audio files - confirm against InternetArchiveStreaming
    """
    handler = self.streaming
    return await handler.get_stream_details(item_id, media_type)
754
async def _calculate_audiobook_duration_and_chapters(
    self, item_id: str
) -> tuple[int, list[MediaItemChapter]]:
    """
    Derive the total duration and the chapter list from an item's audio files.

    Each audio file becomes one chapter, numbered from 1 in file order. A chapter
    starts where the previous one ended; its end is left as None when the file's
    duration is missing or not positive.

    Args:
        item_id: Internet Archive item identifier

    Returns:
        Tuple of (sum of all chapter durations, list of chapters)
    """
    audio_files = await self._get_audio_files(item_id)

    chapters: list[MediaItemChapter] = []
    running_total = 0
    offset = 0.0  # start position of the chapter currently being built

    for index, info in enumerate(audio_files, start=1):
        seconds = parse_duration(info.get("length", "0")) or 0
        # Prefer the file's title, then its name, then a generated label
        label = info.get("title") or info.get("name", f"Chapter {index}")
        chapter_end = offset + seconds if seconds > 0 else None

        chapters.append(
            MediaItemChapter(
                position=index,
                name=clean_text(label),
                start=offset,
                end=chapter_end,
            )
        )
        running_total += seconds
        offset += seconds

    return running_total, chapters
779
780    async def get_audio_stream(
781        self, streamdetails: StreamDetails, seek_position: int = 0
782    ) -> AsyncGenerator[bytes, None]:
783        """Get audio stream from Internet Archive."""
784        # Use sock_read=None to allow long audiobook chapters to stream fully
785        timeout = aiohttp.ClientTimeout(sock_read=None, total=None)
786
787        if streamdetails.media_type == MediaType.AUDIOBOOK and isinstance(streamdetails.data, dict):
788            chapter_urls = streamdetails.data.get("chapters", [])
789            chapters_data = streamdetails.data.get("chapters_data", [])
790
791            # Calculate which chapter to start from based on seek_position
792            seek_position_ms = seek_position * 1000
793            start_chapter = 0
794
795            if seek_position > 0 and chapters_data:
796                accumulated_duration_ms = 0
797
798                for i, chapter_data in enumerate(chapters_data):
799                    chapter_duration_ms = (
800                        parse_duration(chapter_data.get("length", "0")) or 0
801                    ) * 1000
802
803                    if accumulated_duration_ms + chapter_duration_ms > seek_position_ms:
804                        start_chapter = i
805                        break
806                    accumulated_duration_ms += chapter_duration_ms
807
808            # Stream chapters starting from calculated position
809            chapters_yielded = False
810            for i in range(start_chapter, len(chapter_urls)):
811                chapter_url = chapter_urls[i]
812
813                try:
814                    async with self.mass.http_session.get(chapter_url, timeout=timeout) as response:
815                        response.raise_for_status()
816                        async for chunk in response.content.iter_chunked(8192):
817                            chapters_yielded = True
818                            yield chunk
819                except Exception as e:
820                    self.logger.error(f"Chapter {i + 1} streaming failed: {e}")
821                    continue
822
823            # If no chapters succeeded, raise an error instead of silent failure
824            if not chapters_yielded:
825                raise MediaNotFoundError(
826                    f"Failed to stream any chapters for audiobook {streamdetails.item_id}"
827                )
828
829        else:
830            # Handle single files
831            audio_files = await self._get_audio_files(streamdetails.item_id)
832            if audio_files:
833                download_url = self.client.get_download_url(
834                    streamdetails.item_id, audio_files[0]["name"]
835                )
836                async with self.mass.http_session.get(download_url, timeout=timeout) as response:
837                    response.raise_for_status()
838                    async for chunk in response.content.iter_chunked(8192):
839                        yield chunk
840
841    @use_cache(expiration=86400 * 7)  # Cache for 1 week
842    async def get_podcast(self, prov_podcast_id: str) -> Podcast:
843        """Get full podcast details by id."""
844        metadata = await self._get_metadata(prov_podcast_id)
845        item_metadata = metadata.get("metadata", {})
846
847        title = clean_text(item_metadata.get("title"))
848        creator = clean_text(item_metadata.get("creator"))
849
850        if not title:
851            raise MediaNotFoundError(f"Podcast {prov_podcast_id} not found or invalid")
852
853        podcast = Podcast(
854            item_id=prov_podcast_id,
855            provider=self.instance_id,
856            name=title,
857            provider_mappings={
858                create_provider_mapping(
859                    prov_podcast_id, self.domain, self.instance_id, self.client.get_item_url
860                )
861            },
862        )
863
864        # Add publisher/creator
865        if creator:
866            podcast.publisher = creator
867
868        # Add metadata
869        if description := clean_text(item_metadata.get("description")):
870            podcast.metadata.description = description
871
872        # Add thumbnail
873        add_item_image(podcast, prov_podcast_id, self.instance_id)
874
875        # Calculate total episodes
876        try:
877            audio_files = await self._get_audio_files(prov_podcast_id)
878            podcast.total_episodes = len(audio_files)
879        except Exception as err:
880            self.logger.warning(f"Could not get episode count for podcast {prov_podcast_id}: {err}")
881            podcast.total_episodes = None
882
883        return podcast
884
885    async def get_podcast_episodes(
886        self, prov_podcast_id: str
887    ) -> AsyncGenerator[PodcastEpisode, None]:
888        """Get podcast episodes for given podcast id."""
889        metadata = await self._get_metadata(prov_podcast_id)
890        item_metadata = metadata.get("metadata", {})
891        audio_files = await self._get_audio_files(prov_podcast_id)
892
893        # Create podcast reference for episodes
894        podcast = Podcast(
895            item_id=prov_podcast_id,
896            provider=self.instance_id,
897            name=clean_text(item_metadata.get("title", prov_podcast_id)),
898            provider_mappings={
899                create_provider_mapping(
900                    prov_podcast_id, self.domain, self.instance_id, self.client.get_item_url
901                )
902            },
903        )
904
905        for i, file_info in enumerate(audio_files, 1):
906            filename = file_info.get("name", "")
907
908            # Use file's title if available, otherwise clean up filename
909            episode_name = file_info.get("title", filename)
910            if not episode_name or episode_name == filename:
911                episode_name = filename.rsplit(".", 1)[0] if "." in filename else filename
912
913            # Try to extract episode number from file metadata first, then filename
914            episode_number = self._extract_track_number(file_info, episode_name, i)
915
916            episode = PodcastEpisode(
917                item_id=f"{prov_podcast_id}#{filename}",
918                provider=self.instance_id,
919                name=episode_name,
920                position=episode_number,
921                podcast=podcast,
922                provider_mappings={
923                    ProviderMapping(
924                        item_id=f"{prov_podcast_id}#{filename}",
925                        provider_domain=self.domain,
926                        provider_instance=self.instance_id,
927                        url=self.client.get_download_url(prov_podcast_id, filename),
928                        available=True,
929                    )
930                },
931            )
932
933            # Add duration if available
934            if duration_str := file_info.get("length"):
935                if duration := parse_duration(duration_str):
936                    episode.duration = duration
937
938            # Add episode metadata
939            if description := file_info.get("description"):
940                episode.metadata.description = clean_text(description)
941
942            yield episode
943
944    async def get_podcast_episode(self, prov_episode_id: str) -> PodcastEpisode:
945        """Get single podcast episode by id."""
946        if "#" not in prov_episode_id:
947            raise MediaNotFoundError(f"Invalid episode ID format: {prov_episode_id}")
948
949        podcast_id, _ = prov_episode_id.split("#", 1)
950
951        async for episode in self.get_podcast_episodes(podcast_id):
952            if episode.item_id == prov_episode_id:
953                return episode
954
955        raise MediaNotFoundError(f"Episode {prov_episode_id} not found")
956

1"""Internet Archive music provider implementation.""" 2 3from __future__ import annotations 4 5import contextlib 6import re 7from collections.abc import AsyncGenerator 8from typing import TYPE_CHECKING, Any 9 10import aiohttp 11from music_assistant_models.enums import MediaType, ProviderFeature 12from music_assistant_models.errors import InvalidDataError, MediaNotFoundError 13from music_assistant_models.media_items import ( 14 Album, 15 Artist, 16 Audiobook, 17 MediaItemChapter, 18 Podcast, 19 PodcastEpisode, 20 ProviderMapping, 21 SearchResults, 22 Track, 23) 24from music_assistant_models.unique_list import UniqueList 25 26from music_assistant.constants import UNKNOWN_ARTIST 27from music_assistant.controllers.cache import use_cache 28from music_assistant.helpers.throttle_retry import ThrottlerManager, throttle_with_retries 29from music_assistant.models.music_provider import MusicProvider 30 31from .helpers import InternetArchiveClient, clean_text, extract_year, parse_duration 32from .parsers import ( 33 add_item_image, 34 artist_exists, 35 create_artist, 36 create_provider_mapping, 37 create_title_from_identifier, 38 doc_to_album, 39 doc_to_audiobook, 40 doc_to_podcast, 41 doc_to_track, 42 is_audiobook_content, 43 is_likely_album, 44 is_podcast_content, 45) 46from .streaming import InternetArchiveStreaming 47 48if TYPE_CHECKING: 49 from music_assistant_models.config_entries import ProviderConfig 50 from music_assistant_models.provider import ProviderManifest 51 from music_assistant_models.streamdetails import StreamDetails 52 53 from music_assistant import MusicAssistant 54 55 56class InternetArchiveProvider(MusicProvider): 57 """Implementation of Internet Archive music provider.""" 58 59 def __init__( 60 self, 61 mass: MusicAssistant, 62 manifest: ProviderManifest, 63 config: ProviderConfig, 64 supported_features: set[ProviderFeature], 65 ) -> None: 66 """Initialize the provider.""" 67 super().__init__(mass, manifest, config, supported_features) 68 
self.throttler = ThrottlerManager( 69 rate_limit=10, period=60, retry_attempts=5, initial_backoff=5 70 ) 71 self.client = InternetArchiveClient(mass) 72 self.streaming = InternetArchiveStreaming(self) 73 74 @property 75 def is_streaming_provider(self) -> bool: 76 """Return True if provider is a streaming provider.""" 77 return True 78 79 @throttle_with_retries 80 async def _get_json(self, url: str, params: dict[str, Any] | None = None) -> dict[str, Any]: 81 """Make a GET request and return JSON response with throttling.""" 82 return await self.client._get_json(url, params) 83 84 @throttle_with_retries 85 async def _search(self, **kwargs: Any) -> dict[str, Any]: 86 """Throttled search wrapper.""" 87 return await self.client.search(**kwargs) 88 89 @throttle_with_retries 90 async def _get_metadata(self, identifier: str) -> dict[str, Any]: 91 """Throttled metadata wrapper.""" 92 return await self.client.get_metadata(identifier) 93 94 @throttle_with_retries 95 @use_cache(expiration=86400 * 30) # 30 days - file listings are static 96 async def _get_audio_files(self, identifier: str) -> list[dict[str, Any]]: 97 """Throttled audio files wrapper.""" 98 return await self.client.get_audio_files(identifier) 99 100 @use_cache(86400 * 7) # 7 days 101 async def search( 102 self, 103 search_query: str, 104 media_types: list[MediaType], 105 limit: int = 5, 106 ) -> SearchResults: 107 """ 108 Perform search on Internet Archive. 109 110 Uses multiple search strategies to maximize result coverage with 111 proper result accumulation and broader search patterns. 
112 113 Args: 114 search_query: The search term to look for 115 media_types: List of media types to search for 116 limit: Maximum number of results to return per media type 117 118 Returns: 119 SearchResults object containing found items 120 """ 121 if not search_query.strip(): 122 return SearchResults() 123 124 # Adjust search intensity based on what's being requested 125 rows_per_strategy = min(limit * 2, 16) if len(media_types) > 1 else min(limit * 2, 100) 126 127 # Collect results in separate lists 128 tracks: list[Track] = [] 129 albums: list[Album] = [] 130 artists: list[Artist] = [] 131 audiobooks: list[Audiobook] = [] 132 podcasts: list[Podcast] = [] 133 134 # Track processed identifiers to avoid duplicates across strategies 135 processed_ids: set[str] = set() 136 137 # Build search strategies based on requested media types 138 search_strategies = [] 139 140 # For music searches: focus on title and creator 141 if any(mt in media_types for mt in [MediaType.TRACK, MediaType.ALBUM, MediaType.ARTIST]): 142 search_strategies.extend( 143 [ 144 (f"creator:({search_query}) AND mediatype:audio", "downloads desc"), 145 (f"title:({search_query}) AND mediatype:audio", "downloads desc"), 146 (f"subject:({search_query}) AND mediatype:audio", "downloads desc"), 147 ] 148 ) 149 150 # For audiobooks: search within audiobook collections, still limit to audio 151 if MediaType.AUDIOBOOK in media_types: 152 audiobook_query = f"{search_query} AND collection:(librivoxaudio OR audio_bookspoetry) AND mediatype:audio" # noqa: E501 153 search_strategies.append((audiobook_query, "downloads desc")) 154 155 # For podcasts: search within podcast collections 156 if MediaType.PODCAST in media_types: 157 podcast_query = f"{search_query} AND collection:podcasts AND mediatype:audio" 158 search_strategies.append((podcast_query, "downloads desc")) 159 160 for strategy_idx, (strategy_query, sort_order) in enumerate(search_strategies): 161 self.logger.debug("Trying search strategy %d: %s", 
strategy_idx + 1, strategy_query) 162 163 try: 164 search_response = await self._search( 165 query=strategy_query, 166 rows=rows_per_strategy, 167 sort=sort_order, 168 ) 169 170 response_data = search_response.get("response", {}) 171 docs = response_data.get("docs", []) 172 self.logger.debug( 173 "Strategy %d '%s' found %d raw results", 174 strategy_idx + 1, 175 strategy_query, 176 len(docs), 177 ) 178 179 # Process results and extract different media types 180 strategy_processed = 0 181 strategy_skipped = 0 182 183 for doc in docs: 184 try: 185 identifier = doc.get("identifier") 186 if not identifier or identifier in processed_ids: 187 strategy_skipped += 1 188 continue 189 190 # Track this identifier to avoid duplicates 191 processed_ids.add(identifier) 192 193 await self._process_search_result( 194 doc, tracks, albums, artists, audiobooks, podcasts, media_types 195 ) 196 strategy_processed += 1 197 198 # Check if we have enough results across all types 199 if self._has_sufficient_results( 200 tracks, albums, artists, audiobooks, podcasts, media_types, limit 201 ): 202 self.logger.debug( 203 "Sufficient results found after strategy %d, stopping search", 204 strategy_idx + 1, 205 ) 206 break 207 208 except (InvalidDataError, KeyError) as err: 209 self.logger.debug("Skipping invalid search result: %s", err) 210 strategy_skipped += 1 211 continue 212 213 self.logger.debug( 214 "Strategy %d '%s': processed %d new items, skipped %d items. 
" 215 "Running totals - tracks: %d, albums: %d, artists: %d, " 216 "audiobooks: %d, podcasts: %d", 217 strategy_idx + 1, 218 strategy_query, 219 strategy_processed, 220 strategy_skipped, 221 len(tracks), 222 len(albums), 223 len(artists), 224 len(audiobooks), 225 len(podcasts), 226 ) 227 228 # If we have sufficient results, stop trying more strategies 229 if self._has_sufficient_results( 230 tracks, albums, artists, audiobooks, podcasts, media_types, limit 231 ): 232 break 233 234 except Exception as err: 235 self.logger.warning("Search strategy %d failed: %s", strategy_idx + 1, err) 236 continue 237 238 # Log final results for debugging 239 self.logger.debug( 240 "Search for '%s' completed. Final results - tracks: %d, albums: %d, " 241 "artists: %d, audiobooks: %d, podcasts: %d (processed %d unique items)", 242 search_query, 243 len(tracks), 244 len(albums), 245 len(artists), 246 len(audiobooks), 247 len(podcasts), 248 len(processed_ids), 249 ) 250 251 return SearchResults( 252 tracks=tracks[:limit] if MediaType.TRACK in media_types else [], 253 albums=albums[:limit] if MediaType.ALBUM in media_types else [], 254 artists=artists[:limit] if MediaType.ARTIST in media_types else [], 255 audiobooks=audiobooks[:limit] if MediaType.AUDIOBOOK in media_types else [], 256 podcasts=podcasts[:limit] if MediaType.PODCAST in media_types else [], 257 ) 258 259 def _has_sufficient_results( 260 self, 261 tracks: list[Track], 262 albums: list[Album], 263 artists: list[Artist], 264 audiobooks: list[Audiobook], 265 podcasts: list[Podcast], 266 media_types: list[MediaType], 267 limit: int, 268 ) -> bool: 269 """Check if we have sufficient results for all requested media types.""" 270 return ( 271 (MediaType.TRACK not in media_types or len(tracks) >= limit) 272 and (MediaType.ALBUM not in media_types or len(albums) >= limit) 273 and (MediaType.ARTIST not in media_types or len(artists) >= limit) 274 and (MediaType.AUDIOBOOK not in media_types or len(audiobooks) >= limit) 275 and 
(MediaType.PODCAST not in media_types or len(podcasts) >= limit) 276 ) 277 278 async def _process_search_result( 279 self, 280 doc: dict[str, Any], 281 tracks: list[Track], 282 albums: list[Album], 283 artists: list[Artist], 284 audiobooks: list[Audiobook], 285 podcasts: list[Podcast], 286 media_types: list[MediaType], 287 ) -> None: 288 """ 289 Process a single search result document from Internet Archive. 290 291 Determines the appropriate media type and creates corresponding objects. 292 Uses improved heuristics to classify items as tracks, albums, or audiobooks. 293 """ 294 identifier = doc.get("identifier") 295 if not identifier: 296 raise InvalidDataError("Missing identifier in search result") 297 298 title = clean_text(doc.get("title")) 299 creator = clean_text(doc.get("creator")) 300 301 # Be lenient - allow items without title if they have identifier 302 if not title and not identifier: 303 raise InvalidDataError("Missing both title and identifier in search result") 304 305 # Use identifier as fallback title if needed 306 if not title: 307 title = create_title_from_identifier(identifier) 308 309 # Determine what type of item this is 310 mediatype = doc.get("mediatype", "") 311 collection = doc.get("collection", []) 312 if isinstance(collection, str): 313 collection = [collection] 314 315 # Check if this is audiobook content using improved detection 316 if is_audiobook_content(doc) and MediaType.AUDIOBOOK in media_types: 317 audiobook = doc_to_audiobook( 318 doc, self.domain, self.instance_id, self.client.get_item_url 319 ) 320 if audiobook: 321 audiobooks.append(audiobook) 322 return # Don't process as other media types 323 324 # Check if this is podcast content 325 if is_podcast_content(doc) and MediaType.PODCAST in media_types: 326 podcast = doc_to_podcast(doc, self.domain, self.instance_id, self.client.get_item_url) 327 if podcast: 328 podcasts.append(podcast) 329 return # Don't process as other media types 330 331 # For etree items, usually each item 
is an album (concert) 332 if mediatype == "etree" or "etree" in collection: 333 if MediaType.ALBUM in media_types: 334 album = doc_to_album(doc, self.domain, self.instance_id, self.client.get_item_url) 335 if album: 336 albums.append(album) 337 338 if MediaType.ARTIST in media_types and creator: 339 artist = create_artist(creator, self.domain, self.instance_id) 340 if artist and not artist_exists(artist, artists): 341 artists.append(artist) 342 343 elif mediatype == "audio": 344 # Use heuristics to determine album vs track without expensive API calls 345 if is_likely_album(doc): 346 if MediaType.ALBUM in media_types: 347 album = doc_to_album( 348 doc, self.domain, self.instance_id, self.client.get_item_url 349 ) 350 if album: 351 albums.append(album) 352 elif MediaType.TRACK in media_types: 353 track = doc_to_track(doc, self.domain, self.instance_id, self.client.get_item_url) 354 if track: 355 tracks.append(track) 356 357 if MediaType.ARTIST in media_types and creator: 358 artist = create_artist(creator, self.domain, self.instance_id) 359 if artist and not artist_exists(artist, artists): 360 artists.append(artist) 361 362 @use_cache(expiration=86400 * 60) # Cache for 60 days - artist "tracks" change infrequently 363 async def get_track(self, prov_track_id: str) -> Track: 364 """Get full track details by id.""" 365 metadata = await self._get_metadata(prov_track_id) 366 item_metadata = metadata.get("metadata", {}) 367 368 title = clean_text(item_metadata.get("title")) 369 creator = clean_text(item_metadata.get("creator")) 370 371 if not title: 372 raise MediaNotFoundError(f"Track {prov_track_id} not found or invalid") 373 374 track = Track( 375 item_id=prov_track_id, 376 provider=self.instance_id, 377 name=title, 378 provider_mappings={ 379 create_provider_mapping( 380 prov_track_id, self.domain, self.instance_id, self.client.get_item_url 381 ) 382 }, 383 ) 384 385 # Add artist 386 if creator: 387 track.artists = UniqueList([create_artist(creator, self.domain, 
self.instance_id)]) 388 else: 389 track.artists = UniqueList( 390 [create_artist(UNKNOWN_ARTIST, self.domain, self.instance_id)] 391 ) 392 393 # Add duration from first audio file 394 try: 395 audio_files = await self._get_audio_files(prov_track_id) 396 if audio_files and audio_files[0].get("length"): 397 duration = parse_duration(audio_files[0]["length"]) 398 if duration: 399 track.duration = duration 400 except (TimeoutError, aiohttp.ClientError) as err: 401 self.logger.debug("Network error getting duration for track %s: %s", prov_track_id, err) 402 except (KeyError, ValueError, TypeError) as err: 403 self.logger.debug("Could not parse duration for track %s: %s", prov_track_id, err) 404 405 # Add metadata 406 if description := clean_text(item_metadata.get("description")): 407 track.metadata.description = description 408 409 # Add thumbnail 410 add_item_image(track, prov_track_id, self.instance_id) 411 412 return track 413 414 @use_cache(expiration=86400 * 60) # Cache for 60 days - album catalogs change infrequently 415 async def get_album(self, prov_album_id: str) -> Album: 416 """Get full album details by id.""" 417 metadata = await self._get_metadata(prov_album_id) 418 item_metadata = metadata.get("metadata", {}) 419 420 title = clean_text(item_metadata.get("title")) 421 creator = clean_text(item_metadata.get("creator")) 422 423 if not title: 424 raise MediaNotFoundError(f"Album {prov_album_id} not found or invalid") 425 426 album = Album( 427 item_id=prov_album_id, 428 provider=self.instance_id, 429 name=title, 430 provider_mappings={ 431 create_provider_mapping( 432 prov_album_id, self.domain, self.instance_id, self.client.get_item_url 433 ) 434 }, 435 ) 436 437 # Add artist 438 if creator: 439 album.artists = UniqueList([create_artist(creator, self.domain, self.instance_id)]) 440 else: 441 album.artists = UniqueList( 442 [create_artist(UNKNOWN_ARTIST, self.domain, self.instance_id)] 443 ) 444 445 # Add metadata 446 if date := 
extract_year(item_metadata.get("date")): 447 album.year = date 448 449 if description := clean_text(item_metadata.get("description")): 450 album.metadata.description = description 451 452 # Add thumbnail 453 add_item_image(album, prov_album_id, self.instance_id) 454 455 return album 456 457 @use_cache(expiration=86400 * 60) # Cache for 60 days - artist catalogs change infrequently 458 async def get_artist(self, prov_artist_id: str) -> Artist: 459 """ 460 Get full artist details by id. 461 462 Args: 463 prov_artist_id: Provider-specific artist identifier (artist name) 464 465 Returns: 466 Artist object 467 """ 468 # Artist IDs are just the creator names 469 return Artist( 470 item_id=prov_artist_id, 471 provider=self.instance_id, 472 name=prov_artist_id, 473 provider_mappings={ 474 ProviderMapping( 475 item_id=prov_artist_id, 476 provider_domain=self.domain, 477 provider_instance=self.instance_id, 478 ) 479 }, 480 ) 481 482 @use_cache(expiration=86400 * 30) # Cache for 30 days - audiobook catalogs change infrequently 483 async def get_audiobook(self, prov_audiobook_id: str) -> Audiobook: 484 """Get full audiobook details by id.""" 485 metadata = await self._get_metadata(prov_audiobook_id) 486 item_metadata = metadata.get("metadata", {}) 487 488 title = clean_text(item_metadata.get("title")) 489 creator = clean_text(item_metadata.get("creator")) 490 491 if not title: 492 raise MediaNotFoundError(f"Audiobook {prov_audiobook_id} not found or invalid") 493 494 audiobook = Audiobook( 495 item_id=prov_audiobook_id, 496 provider=self.instance_id, 497 name=title, 498 provider_mappings={ 499 create_provider_mapping( 500 prov_audiobook_id, self.domain, self.instance_id, self.client.get_item_url 501 ) 502 }, 503 ) 504 505 # Add author/narrator 506 if creator: 507 author_list = [creator] 508 audiobook.authors = UniqueList(author_list) 509 510 # Add metadata 511 if description := clean_text(item_metadata.get("description")): 512 audiobook.metadata.description = description 513 
514 # Add thumbnail 515 add_item_image(audiobook, prov_audiobook_id, self.instance_id) 516 517 # Calculate duration and chapters 518 try: 519 total_duration, chapters = await self._calculate_audiobook_duration_and_chapters( 520 prov_audiobook_id 521 ) 522 audiobook.duration = total_duration 523 if len(chapters) > 1: 524 audiobook.metadata.chapters = chapters 525 526 except Exception as err: 527 self.logger.warning( 528 f"Could not process audio files for audiobook {prov_audiobook_id}: {err}" 529 ) 530 audiobook.duration = 0 531 audiobook.metadata.chapters = [] 532 533 return audiobook 534 535 async def get_album_tracks(self, prov_album_id: str) -> list[Track]: 536 """Get album tracks for given album id.""" 537 metadata = await self._get_metadata(prov_album_id) 538 item_metadata = metadata.get("metadata", {}) 539 audio_files = await self._get_audio_files(prov_album_id) 540 tracks = [] 541 542 # Pre-create album artist to avoid duplicates 543 album_artist = clean_text(item_metadata.get("creator")) 544 album_artist_normalized = album_artist.lower() if album_artist else "" 545 album_artist_obj = None 546 if album_artist: 547 album_artist_obj = create_artist(album_artist, self.domain, self.instance_id) 548 else: 549 album_artist_obj = create_artist(UNKNOWN_ARTIST, self.domain, self.instance_id) 550 551 for i, file_info in enumerate(audio_files, 1): 552 filename = file_info.get("name", "") 553 554 # Use file's title if available, otherwise clean up filename 555 track_name = file_info.get("title", filename) 556 if not track_name or track_name == filename: 557 track_name = filename.rsplit(".", 1)[0] if "." 
in filename else filename 558 559 # Try to extract track number from file metadata first, then filename 560 track_number = self._extract_track_number(file_info, track_name, i) 561 562 track = Track( 563 item_id=f"{prov_album_id}#{filename}", 564 provider=self.instance_id, 565 name=track_name, 566 track_number=track_number, 567 provider_mappings={ 568 ProviderMapping( 569 item_id=f"{prov_album_id}#{filename}", 570 provider_domain=self.domain, 571 provider_instance=self.instance_id, 572 url=self.client.get_download_url(prov_album_id, filename), 573 available=True, 574 ) 575 }, 576 ) 577 578 # Add file-specific artist if available, otherwise use album artist 579 file_artist = file_info.get("artist") or file_info.get("creator") 580 if file_artist: 581 file_artist_cleaned = clean_text(file_artist) 582 file_artist_normalized = file_artist_cleaned.lower() 583 # Check if this is the same as album artist to avoid duplicates (case-insensitive) 584 if album_artist_normalized and file_artist_normalized == album_artist_normalized: 585 track.artists = UniqueList([album_artist_obj]) 586 else: 587 track.artists = UniqueList( 588 [create_artist(file_artist_cleaned, self.domain, self.instance_id)] 589 ) 590 else: 591 # Use pre-created album artist object 592 track.artists = UniqueList([album_artist_obj]) 593 594 # Add duration if available 595 if duration_str := file_info.get("length"): 596 if duration := parse_duration(duration_str): 597 track.duration = duration 598 599 # Add genre if available 600 if genre := file_info.get("genre"): 601 track.metadata.genres = {clean_text(genre)} 602 603 tracks.append(track) 604 605 return tracks 606 607 def _extract_track_number( 608 self, file_info: dict[str, Any], track_name: str, fallback: int 609 ) -> int: 610 """Extract track number from file metadata or filename.""" 611 track_number = None 612 613 if "track" in file_info: 614 with contextlib.suppress(ValueError, AttributeError): 615 track_number = int(str(file_info["track"]).split("/")[0]) 
616 617 if track_number is None: 618 # Fallback to filename parsing 619 track_num_match = re.search(r"^(\d+)[\s\-_.]*(.+)", track_name) 620 track_number = int(track_num_match.group(1)) if track_num_match else fallback 621 622 return track_number 623 624 @use_cache(expiration=86400 * 30) # Cache for 30 days - artist catalogs change infrequently 625 async def get_artist_albums(self, prov_artist_id: str) -> list[Album]: 626 """ 627 Get albums for a specific artist. 628 629 Uses metadata heuristics to determine likely albums without expensive 630 API calls for better performance. 631 632 Args: 633 prov_artist_id: Provider-specific artist identifier (artist name) 634 635 Returns: 636 List of Album objects by the artist 637 """ 638 albums: list[Album] = [] 639 page = 0 640 page_size = 200 # IA's maximum 641 642 while len(albums) < 1000: # Reasonable upper limit 643 search_response = await self._search( 644 query=f'creator:"{prov_artist_id}" AND (format:"VBR MP3" OR format:"FLAC" \ 645 OR format:"Ogg Vorbis")', 646 sort="downloads desc", 647 rows=page_size, 648 page=page, 649 ) 650 651 docs = search_response.get("response", {}).get("docs", []) 652 if not docs: 653 break 654 655 for doc in docs: 656 try: 657 # Use metadata heuristics instead of expensive API calls 658 # to determine if item is an album 659 if is_likely_album(doc): 660 album = doc_to_album( 661 doc, self.domain, self.instance_id, self.client.get_item_url 662 ) 663 if album: 664 albums.append(album) 665 except (KeyError, ValueError, TypeError) as err: 666 self.logger.debug( 667 "Skipping invalid album for artist %s: %s", prov_artist_id, err 668 ) 669 continue 670 except (TimeoutError, aiohttp.ClientError) as err: 671 self.logger.debug( 672 "Network error processing album for artist %s: %s", prov_artist_id, err 673 ) 674 continue 675 except Exception as err: 676 self.logger.exception( 677 "Unexpected error processing album for artist %s: %s", prov_artist_id, err 678 ) 679 continue 680 page += 1 681 return 
albums 682 683 @use_cache(expiration=86400 * 7) # Cache for 1 week 684 async def get_artist_toptracks(self, prov_artist_id: str) -> list[Track]: 685 """ 686 Get top tracks for a specific artist. 687 688 Uses the same search as get_artist_albums but filters for single tracks. 689 690 Args: 691 prov_artist_id: Provider-specific artist identifier (artist name) 692 693 Returns: 694 List of Track objects representing the artist's top tracks 695 """ 696 tracks = [] 697 search_response = await self._search( 698 query=( 699 f'creator:"{prov_artist_id}" AND ' 700 f'(format:"VBR MP3" OR format:"FLAC" OR format:"Ogg Vorbis")' 701 ), 702 rows=25, # Limit for "top" tracks 703 sort="downloads desc", 704 ) 705 706 response_data = search_response.get("response", {}) 707 docs = response_data.get("docs", []) 708 709 for doc in docs: 710 try: 711 # Only include items that are NOT classified as albums 712 if not is_likely_album(doc): 713 track = doc_to_track( 714 doc, self.domain, self.instance_id, self.client.get_item_url 715 ) 716 if track: 717 tracks.append(track) 718 except (KeyError, ValueError, TypeError) as err: 719 self.logger.debug("Skipping invalid track for artist %s: %s", prov_artist_id, err) 720 continue 721 except (TimeoutError, aiohttp.ClientError) as err: 722 self.logger.debug( 723 "Network error processing track for artist %s: %s", prov_artist_id, err 724 ) 725 continue 726 except Exception as err: 727 self.logger.exception( 728 "Unexpected error processing track for artist %s: %s", prov_artist_id, err 729 ) 730 continue 731 732 if len(tracks) >= 25: 733 break 734 735 return tracks 736 737 async def get_stream_details(self, item_id: str, media_type: MediaType) -> StreamDetails: 738 """ 739 Get streamdetails for a track or audiobook. 740 741 Delegates to the streaming handler for proper multi-file support. 
742 743 Args: 744 item_id: Provider-specific item identifier 745 media_type: The type of media being requested 746 747 Returns: 748 StreamDetails object configured for the specific item type 749 750 Raises: 751 MediaNotFoundError: If no audio files are found for the item 752 """ 753 return await self.streaming.get_stream_details(item_id, media_type) 754 755 async def _calculate_audiobook_duration_and_chapters( 756 self, item_id: str 757 ) -> tuple[int, list[MediaItemChapter]]: 758 """Calculate duration and chapters for audiobooks.""" 759 audio_files = await self._get_audio_files(item_id) 760 total_duration = 0 761 chapters = [] 762 current_position = 0.0 763 764 for i, file_info in enumerate(audio_files, 1): 765 chapter_duration = parse_duration(file_info.get("length", "0")) or 0 766 total_duration += chapter_duration 767 768 chapter_name = file_info.get("title") or file_info.get("name", f"Chapter {i}") 769 chapter = MediaItemChapter( 770 position=i, 771 name=clean_text(chapter_name), 772 start=current_position, 773 end=current_position + chapter_duration if chapter_duration > 0 else None, 774 ) 775 chapters.append(chapter) 776 current_position += chapter_duration 777 778 return total_duration, chapters 779 780 async def get_audio_stream( 781 self, streamdetails: StreamDetails, seek_position: int = 0 782 ) -> AsyncGenerator[bytes, None]: 783 """Get audio stream from Internet Archive.""" 784 # Use sock_read=None to allow long audiobook chapters to stream fully 785 timeout = aiohttp.ClientTimeout(sock_read=None, total=None) 786 787 if streamdetails.media_type == MediaType.AUDIOBOOK and isinstance(streamdetails.data, dict): 788 chapter_urls = streamdetails.data.get("chapters", []) 789 chapters_data = streamdetails.data.get("chapters_data", []) 790 791 # Calculate which chapter to start from based on seek_position 792 seek_position_ms = seek_position * 1000 793 start_chapter = 0 794 795 if seek_position > 0 and chapters_data: 796 accumulated_duration_ms = 0 797 798 for 
i, chapter_data in enumerate(chapters_data): 799 chapter_duration_ms = ( 800 parse_duration(chapter_data.get("length", "0")) or 0 801 ) * 1000 802 803 if accumulated_duration_ms + chapter_duration_ms > seek_position_ms: 804 start_chapter = i 805 break 806 accumulated_duration_ms += chapter_duration_ms 807 808 # Stream chapters starting from calculated position 809 chapters_yielded = False 810 for i in range(start_chapter, len(chapter_urls)): 811 chapter_url = chapter_urls[i] 812 813 try: 814 async with self.mass.http_session.get(chapter_url, timeout=timeout) as response: 815 response.raise_for_status() 816 async for chunk in response.content.iter_chunked(8192): 817 chapters_yielded = True 818 yield chunk 819 except Exception as e: 820 self.logger.error(f"Chapter {i + 1} streaming failed: {e}") 821 continue 822 823 # If no chapters succeeded, raise an error instead of silent failure 824 if not chapters_yielded: 825 raise MediaNotFoundError( 826 f"Failed to stream any chapters for audiobook {streamdetails.item_id}" 827 ) 828 829 else: 830 # Handle single files 831 audio_files = await self._get_audio_files(streamdetails.item_id) 832 if audio_files: 833 download_url = self.client.get_download_url( 834 streamdetails.item_id, audio_files[0]["name"] 835 ) 836 async with self.mass.http_session.get(download_url, timeout=timeout) as response: 837 response.raise_for_status() 838 async for chunk in response.content.iter_chunked(8192): 839 yield chunk 840 841 @use_cache(expiration=86400 * 7) # Cache for 1 week 842 async def get_podcast(self, prov_podcast_id: str) -> Podcast: 843 """Get full podcast details by id.""" 844 metadata = await self._get_metadata(prov_podcast_id) 845 item_metadata = metadata.get("metadata", {}) 846 847 title = clean_text(item_metadata.get("title")) 848 creator = clean_text(item_metadata.get("creator")) 849 850 if not title: 851 raise MediaNotFoundError(f"Podcast {prov_podcast_id} not found or invalid") 852 853 podcast = Podcast( 854 
item_id=prov_podcast_id, 855 provider=self.instance_id, 856 name=title, 857 provider_mappings={ 858 create_provider_mapping( 859 prov_podcast_id, self.domain, self.instance_id, self.client.get_item_url 860 ) 861 }, 862 ) 863 864 # Add publisher/creator 865 if creator: 866 podcast.publisher = creator 867 868 # Add metadata 869 if description := clean_text(item_metadata.get("description")): 870 podcast.metadata.description = description 871 872 # Add thumbnail 873 add_item_image(podcast, prov_podcast_id, self.instance_id) 874 875 # Calculate total episodes 876 try: 877 audio_files = await self._get_audio_files(prov_podcast_id) 878 podcast.total_episodes = len(audio_files) 879 except Exception as err: 880 self.logger.warning(f"Could not get episode count for podcast {prov_podcast_id}: {err}") 881 podcast.total_episodes = None 882 883 return podcast 884 885 async def get_podcast_episodes( 886 self, prov_podcast_id: str 887 ) -> AsyncGenerator[PodcastEpisode, None]: 888 """Get podcast episodes for given podcast id.""" 889 metadata = await self._get_metadata(prov_podcast_id) 890 item_metadata = metadata.get("metadata", {}) 891 audio_files = await self._get_audio_files(prov_podcast_id) 892 893 # Create podcast reference for episodes 894 podcast = Podcast( 895 item_id=prov_podcast_id, 896 provider=self.instance_id, 897 name=clean_text(item_metadata.get("title", prov_podcast_id)), 898 provider_mappings={ 899 create_provider_mapping( 900 prov_podcast_id, self.domain, self.instance_id, self.client.get_item_url 901 ) 902 }, 903 ) 904 905 for i, file_info in enumerate(audio_files, 1): 906 filename = file_info.get("name", "") 907 908 # Use file's title if available, otherwise clean up filename 909 episode_name = file_info.get("title", filename) 910 if not episode_name or episode_name == filename: 911 episode_name = filename.rsplit(".", 1)[0] if "." 
in filename else filename 912 913 # Try to extract episode number from file metadata first, then filename 914 episode_number = self._extract_track_number(file_info, episode_name, i) 915 916 episode = PodcastEpisode( 917 item_id=f"{prov_podcast_id}#{filename}", 918 provider=self.instance_id, 919 name=episode_name, 920 position=episode_number, 921 podcast=podcast, 922 provider_mappings={ 923 ProviderMapping( 924 item_id=f"{prov_podcast_id}#{filename}", 925 provider_domain=self.domain, 926 provider_instance=self.instance_id, 927 url=self.client.get_download_url(prov_podcast_id, filename), 928 available=True, 929 ) 930 }, 931 ) 932 933 # Add duration if available 934 if duration_str := file_info.get("length"): 935 if duration := parse_duration(duration_str): 936 episode.duration = duration 937 938 # Add episode metadata 939 if description := file_info.get("description"): 940 episode.metadata.description = clean_text(description) 941 942 yield episode 943 944 async def get_podcast_episode(self, prov_episode_id: str) -> PodcastEpisode: 945 """Get single podcast episode by id.""" 946 if "#" not in prov_episode_id: 947 raise MediaNotFoundError(f"Invalid episode ID format: {prov_episode_id}") 948 949 podcast_id, _ = prov_episode_id.split("#", 1) 950 951 async for episode in self.get_podcast_episodes(podcast_id): 952 if episode.item_id == prov_episode_id: 953 return episode 954 955 raise MediaNotFoundError(f"Episode {prov_episode_id} not found") 956