music-assistant-server

11.3 KBPY
parsers.py
11.3 KB402 lines • python
1"""Metadata parsing utilities for the Internet Archive provider."""
2
3from __future__ import annotations
4
5import re
6from collections.abc import Callable
7from typing import Any
8
9from music_assistant_models.enums import AlbumType, ImageType
10from music_assistant_models.media_items import (
11    Album,
12    Artist,
13    Audiobook,
14    MediaItemImage,
15    Podcast,
16    ProviderMapping,
17    Track,
18)
19from music_assistant_models.unique_list import UniqueList
20
21from .constants import AUDIOBOOK_COLLECTIONS
22from .helpers import clean_text, extract_year, get_image_url
23
24
def is_likely_album(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is likely an album using metadata heuristics.

    Uses collection types, media types, title analysis, and file count hints to classify items
    without making expensive API calls to check individual file counts.

    The checks run in a fixed order and the first one that matches decides the result:

    1. ``etree`` collection -> album.
    2. Any collection listed in ``AUDIOBOOK_COLLECTIONS`` -> not an album.
    3. A ``files`` list with more than three entries -> album.
    4. Album keywords in the title -> album.
    5. Single-track keywords in the title -> not an album.
    6. Collection rules for ``netlabels``, ``78rpm``, ``oldtimeradio`` and ``audio_music``.
    7. For ``mediatype == "audio"``: numbering patterns in the title.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is likely an album, False if likely a single track
    """
    mediatype = doc.get("mediatype", "")
    # ``or []`` also covers an explicit ``"collection": None`` entry, which would otherwise
    # raise TypeError on the membership tests below.
    collection = doc.get("collection") or []
    title = clean_text(doc.get("title", "")).lower()

    # A single collection may be given as a bare string; normalise to a list so the
    # membership tests compare whole collection names instead of substrings.
    if isinstance(collection, str):
        collection = [collection]

    # etree collection items are almost always live concert albums
    if "etree" in collection:
        return True

    # Skip obvious audiobook/speech collections - these are handled separately
    if any(coll in AUDIOBOOK_COLLECTIONS for coll in collection):
        return False

    # Some IA items include file information. More than 3 entries suggests several audio
    # files (allowing for 1-2 audio files plus derivatives such as thumbnails).
    files = doc.get("files")
    if isinstance(files, list) and len(files) > 3:
        return True

    # Title keywords, matched as plain substrings of the lower-cased title.
    # The trailing space on some entries is deliberate ("vol 2", "cd 1", ...).
    album_indicators = (
        "album",
        "live",
        "concert",
        "session",
        "collection",
        "compilation",
        "complete",
        "anthology",
        "best of",
        "greatest hits",
        "discography",
        "vol ",
        "volume",
        "part ",
        "disc ",
        "cd ",
        "lp ",
    )

    single_indicators = (
        "single",
        "track",
        "song",
        "remix",
        "edit",
        "version",
        "demo",
        "instrumental",
        "acoustic version",
    )

    # Strong album indicators in title (checked first, so they win over single indicators)
    if any(indicator in title for indicator in album_indicators):
        return True

    # Strong single indicators in title
    if any(indicator in title for indicator in single_indicators):
        return False

    # Collection-specific logic
    if "netlabels" in collection:
        # Netlabel releases are usually albums/EPs
        return True

    if "78rpm" in collection:
        # 78 RPM records are usually single tracks (A-side/B-side)
        return False

    if "oldtimeradio" in collection:
        # Radio shows are usually single episodes, treat as tracks
        return False

    if "audio_music" in collection:
        # General music uploads - check for multi-track indicators in title.
        # "ep" must be matched as a whole word: as a plain substring it would also hit
        # ordinary words such as "deep", "keep" or "september".
        if re.search(r"\bep\b", title):
            return True
        return any(indicator in title for indicator in ("album", "mixtape", "playlist"))

    # For unknown collections with audio mediatype, be conservative
    # Default to single track unless we have strong evidence of multiple tracks
    if mediatype == "audio":
        # Look for numbering that suggests multiple parts/tracks.
        # NOTE(review): any title containing "track" has already returned False at the
        # single-indicator check above, so the "track" alternative cannot match here.
        if re.search(r"\b(track|part|chapter)\s*\d+", title):
            return True  # Likely part of a larger work
        # "3 of 12"-style numbering
        return bool(re.search(r"\b\d+\s*of\s*\d+\b", title))

    return False
133
134
def doc_to_audiobook(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Audiobook | None:
    """
    Build an Audiobook from an Internet Archive document.

    The ``identifier`` and ``title`` fields are required; ``creator`` and ``description``
    are copied over when they are non-empty after ``clean_text``.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Audiobook object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    author = clean_text(doc.get("creator"))

    # Without both an identifier and a title there is nothing usable to return
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Audiobook(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # The creator field is stored as the (single) author entry
    if author:
        result.authors.append(author)

    # Description is optional metadata
    summary = clean_text(doc.get("description"))
    if summary:
        result.metadata.description = summary

    # Thumbnail, if one can be resolved for the identifier
    add_item_image(result, item_id, instance_id)

    return result
176
177
def doc_to_track(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Track | None:
    """
    Build a Track from an Internet Archive document.

    The ``identifier`` and ``title`` fields are required; a non-empty ``creator`` becomes
    the track's only artist.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Track object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # Without both an identifier and a title there is nothing usable to return
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Track(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Attach the creator as artist when one is given
    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    # Thumbnail, if one can be resolved for the identifier
    add_item_image(result, item_id, instance_id)

    return result
215
216
def doc_to_album(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Album | None:
    """
    Build an Album from an Internet Archive document.

    The ``identifier`` and ``title`` fields are required. A non-empty ``creator`` becomes
    the album's only artist; ``date`` (via ``extract_year``) and ``description`` are copied
    when available. The album type is always set to ``AlbumType.ALBUM``.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Album object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # Without both an identifier and a title there is nothing usable to return
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Album(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Attach the creator as artist when one is given
    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    # Optional metadata: release year and description
    year = extract_year(doc.get("date"))
    if year:
        result.year = year

    summary = clean_text(doc.get("description"))
    if summary:
        result.metadata.description = summary

    # Thumbnail, if one can be resolved for the identifier
    add_item_image(result, item_id, instance_id)

    # Every item converted here is reported as a regular album
    result.album_type = AlbumType.ALBUM

    return result
264
265
def doc_to_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """Return an Artist for ``creator_name``; thin wrapper that delegates to ``create_artist``."""
    return create_artist(
        creator_name=creator_name,
        domain=domain,
        instance_id=instance_id,
    )
269
270
def create_title_from_identifier(identifier: str) -> str:
    """Turn an identifier into a display title: ``_``/``-`` become spaces, then title-case."""
    # Map both separator characters to a space in a single pass
    separators_to_space = str.maketrans("_-", "  ")
    spaced = identifier.translate(separators_to_space)
    return spaced.title()
274
275
def artist_exists(artist: Artist, artists: list[Artist]) -> bool:
    """Return True when some entry in ``artists`` has the same ``name`` as ``artist``."""
    # Only the name is compared (exact, case-sensitive equality)
    for candidate in artists:
        if candidate.name == artist.name:
            return True
    return False
279
280
def create_provider_mapping(
    identifier: str, domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> ProviderMapping:
    """
    Build the ProviderMapping for an item.

    The mapping's URL comes from ``item_url_func(identifier)`` and the mapping is always
    marked as available.
    """
    item_url = item_url_func(identifier)
    return ProviderMapping(
        item_id=identifier,
        provider_domain=domain,
        provider_instance=instance_id,
        url=item_url,
        available=True,
    )
292
293
def create_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """
    Build an Artist from a creator name.

    The creator name is used both as the artist's display name and as its item id
    (on the artist itself and on its single provider mapping).
    """
    mapping = ProviderMapping(
        item_id=creator_name,
        provider_domain=domain,
        provider_instance=instance_id,
    )
    return Artist(
        item_id=creator_name,
        provider=instance_id,
        name=creator_name,
        provider_mappings={mapping},
    )
308
309
def add_item_image(
    item: Track | Album | Audiobook | Podcast, identifier: str, instance_id: str
) -> None:
    """Attach a thumbnail to ``item`` when ``get_image_url`` yields a URL for ``identifier``."""
    thumb_url = get_image_url(identifier)
    if not thumb_url:
        # No image URL for this identifier; leave the item untouched
        return

    thumbnail = MediaItemImage(
        type=ImageType.THUMB,
        path=thumb_url,
        provider=instance_id,
        remotely_accessible=True,
    )
    item.metadata.add_image(thumbnail)
323
324
def is_audiobook_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is audiobook content.

    Checks if the item is from a known audiobook collection, i.e. whether any of its
    collections is listed in ``AUDIOBOOK_COLLECTIONS``.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is from a known audiobook collection; False otherwise, including
        when the ``collection`` field is missing, empty or None
    """
    # ``or []`` also covers an explicit ``"collection": None`` entry, which would
    # otherwise raise TypeError when iterated.
    collection = doc.get("collection") or []
    # A single collection may be given as a bare string; wrap it so iteration yields
    # the whole name rather than individual characters.
    if isinstance(collection, str):
        collection = [collection]

    return any(coll in AUDIOBOOK_COLLECTIONS for coll in collection)
342
343
def doc_to_podcast(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Podcast | None:
    """
    Build a Podcast from an Internet Archive document.

    The ``identifier`` and ``title`` fields are required; a non-empty ``creator`` is stored
    as the publisher and a non-empty ``description`` as the metadata description.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Podcast object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    publisher = clean_text(doc.get("creator"))

    # Without both an identifier and a title there is nothing usable to return
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Podcast(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # The creator field doubles as the podcast publisher
    if publisher:
        result.publisher = publisher

    # Description is optional metadata
    summary = clean_text(doc.get("description"))
    if summary:
        result.metadata.description = summary

    # Thumbnail, if one can be resolved for the identifier
    add_item_image(result, item_id, instance_id)

    return result
385
386
def is_podcast_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is podcast content.

    An item counts as a podcast when one of its collections is exactly ``"podcasts"``.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is from a podcast collection; False otherwise, including when
        the ``collection`` field is missing, empty or None
    """
    # ``or []`` also covers an explicit ``"collection": None`` entry, which would
    # otherwise raise TypeError on the membership test.
    collection = doc.get("collection") or []
    # A single collection may be given as a bare string; wrap it so the check compares
    # whole collection names instead of doing a substring search.
    if isinstance(collection, str):
        collection = [collection]

    return "podcasts" in collection
402