music-assistant-server

11.3 KB•PY

parsers.py

11.3 KB • 402 lines • python

1"""Metadata parsing utilities for the Internet Archive provider."""
2
3from __future__ import annotations
4
5import re
6from collections.abc import Callable
7from typing import Any
8
9from music_assistant_models.enums import AlbumType, ImageType
10from music_assistant_models.media_items import (
11    Album,
12    Artist,
13    Audiobook,
14    MediaItemImage,
15    Podcast,
16    ProviderMapping,
17    Track,
18)
19from music_assistant_models.unique_list import UniqueList
20
21from .constants import AUDIOBOOK_COLLECTIONS
22from .helpers import clean_text, extract_year, get_image_url
23
24
# Title keywords that suggest a multi-track release.
_ALBUM_TITLE_TERMS = (
    "album",
    "live",
    "concert",
    "session",
    "collection",
    "compilation",
    "complete",
    "anthology",
    "best of",
    "greatest hits",
    "discography",
    "vol",
    "volume",
    "part",
    "disc",
    "cd",
    "lp",
)

# Title keywords that suggest a single recording.
_SINGLE_TITLE_TERMS = (
    "single",
    "track",
    "song",
    "remix",
    "edit",
    "version",
    "demo",
    "instrumental",
    "acoustic version",
)

# Extra multi-track keywords that are only consulted for the generic "audio_music" collection.
_MULTI_TRACK_TITLE_TERMS = ("ep", "album", "mixtape", "playlist")


def _compile_title_terms(terms: tuple[str, ...]) -> re.Pattern[str]:
    """Compile keywords into a single pattern that matches them as whole words only."""
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in term.split()) for term in terms
    )
    # Word boundaries stop short keywords from firing inside unrelated words
    # ("ep" in "deep", "lp" in "help", "edit" in "edition", "track" in "soundtrack").
    # The optional plural suffix keeps "sessions", "remixes", "demos" etc. matching.
    return re.compile(rf"\b(?:{alternatives})(?:e?s)?\b")


_ALBUM_TITLE_RE = _compile_title_terms(_ALBUM_TITLE_TERMS)
_SINGLE_TITLE_RE = _compile_title_terms(_SINGLE_TITLE_TERMS)
_MULTI_TRACK_TITLE_RE = _compile_title_terms(_MULTI_TRACK_TITLE_TERMS)

# "part3" / "chapter 12" style numbering. A "track" alternative is deliberately absent:
# titles mentioning "track" have always been classified as singles before this check ran.
_NUMBERED_PART_RE = re.compile(r"\b(?:part|chapter)\s*\d+")
# "2 of 5" style numbering.
_X_OF_Y_RE = re.compile(r"\b\d+\s*of\s*\d+\b")


def is_likely_album(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is likely an album using metadata heuristics.

    The checks run in this order and the first one that applies decides the result:

    1. Membership of the "etree" collection -> album.
    2. Membership of any collection in AUDIOBOOK_COLLECTIONS -> not an album.
    3. A "files" list with more than three entries -> album.
    4. Album keywords in the title -> album; single keywords in the title -> not an album.
    5. Collection rules: "netlabels" -> album; "78rpm" / "oldtimeradio" -> not an album;
       "audio_music" -> album only if the title contains a multi-track keyword.
    6. For mediatype "audio": numbering such as "part 2", "chapter 3" or "1 of 4" -> album.

    Title keywords are matched case-insensitively as whole words (with an optional plural
    suffix), not as raw substrings.

    Args:
        doc: Internet Archive document metadata. "collection" may be a string, a list,
            missing or None; "title" may be missing or None.

    Returns:
        True if the item is likely an album, False if likely a single track
    """
    # `or []` also covers an explicit null value, which `doc.get(key, [])` would pass through
    # and which would then break the membership tests below.
    collection = doc.get("collection") or []
    if isinstance(collection, str):
        collection = [collection]

    # etree collection items are almost always live concert albums
    if "etree" in collection:
        return True

    # Audiobook/speech collections are handled separately by the caller
    if any(coll in AUDIOBOOK_COLLECTIONS for coll in collection):
        return False

    # Some documents carry a file listing; more than three entries means more than
    # one or two audio files plus derivatives such as thumbnails.
    files = doc.get("files")
    if isinstance(files, list) and len(files) > 3:
        return True

    title = clean_text(doc.get("title") or "").lower()

    # Title keywords: album hints win over single hints when both are present
    if _ALBUM_TITLE_RE.search(title):
        return True
    if _SINGLE_TITLE_RE.search(title):
        return False

    # Collection-specific defaults
    if "netlabels" in collection:
        # Netlabel releases are usually albums/EPs
        return True
    if "78rpm" in collection or "oldtimeradio" in collection:
        # 78 RPM sides and radio episodes are treated as single tracks
        return False
    if "audio_music" in collection:
        # General music uploads: only an explicit multi-track keyword makes it an album
        return bool(_MULTI_TRACK_TITLE_RE.search(title))

    # Unknown collections: stay conservative and require explicit numbering in the title
    if doc.get("mediatype", "") == "audio":
        return bool(_NUMBERED_PART_RE.search(title) or _X_OF_Y_RE.search(title))

    return False
133
134
def doc_to_audiobook(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Audiobook | None:
    """
    Build an Audiobook from an Internet Archive document.

    The audiobook receives the document's identifier and cleaned title, one provider
    mapping, the cleaned creator as an author (when non-empty), the cleaned description
    (when non-empty) and a thumbnail (when one is available).

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        The Audiobook, or None when the identifier or the cleaned title is missing/empty
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    author = clean_text(doc.get("creator"))

    # Both an identifier and a usable title are required
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    book = Audiobook(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # The creator field is stored as an author entry
    if author:
        book.authors.append(author)

    summary = clean_text(doc.get("description"))
    if summary:
        book.metadata.description = summary

    add_item_image(book, item_id, instance_id)
    return book
176
177
def doc_to_track(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Track | None:
    """
    Build a Track from an Internet Archive document.

    The track receives the document's identifier and cleaned title, one provider mapping,
    a single artist derived from the cleaned creator (when non-empty) and a thumbnail
    (when one is available).

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        The Track, or None when the identifier or the cleaned title is missing/empty
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # Both an identifier and a usable title are required
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Track(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    add_item_image(result, item_id, instance_id)
    return result
215
216
def doc_to_album(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Album | None:
    """
    Build an Album from an Internet Archive document.

    The album receives the document's identifier and cleaned title, one provider mapping,
    a single artist derived from the cleaned creator (when non-empty), the year extracted
    from the "date" field (when truthy), the cleaned description (when non-empty) and a
    thumbnail (when one is available). The album type is always AlbumType.ALBUM.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        The Album, or None when the identifier or the cleaned title is missing/empty
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # Both an identifier and a usable title are required
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Album(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    year = extract_year(doc.get("date"))
    if year:
        result.year = year

    summary = clean_text(doc.get("description"))
    if summary:
        result.metadata.description = summary

    add_item_image(result, item_id, instance_id)

    # Every item converted here is reported as a regular album
    result.album_type = AlbumType.ALBUM
    return result
264
265
def doc_to_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """
    Convert a creator name to an Artist object.

    Thin wrapper that delegates to create_artist with the same arguments.

    Args:
        creator_name: Creator name, used as both the artist's id and its display name
        domain: Provider domain
        instance_id: Provider instance identifier

    Returns:
        The Artist built by create_artist
    """
    return create_artist(creator_name=creator_name, domain=domain, instance_id=instance_id)
269
270
def create_title_from_identifier(identifier: str) -> str:
    """
    Create a human-readable title from an Internet Archive identifier.

    Underscores and hyphens become spaces and the result is title-cased with str.title().

    Args:
        identifier: Internet Archive item identifier

    Returns:
        The title-cased identifier with "_" and "-" replaced by spaces
    """
    separators_to_spaces = str.maketrans("_-", "  ")
    spaced = identifier.translate(separators_to_spaces)
    return spaced.title()
274
275
def artist_exists(artist: Artist, artists: list[Artist]) -> bool:
    """
    Check if an artist already exists in the list to avoid duplicates.

    Artists are compared by their name attribute only.

    Args:
        artist: Artist to look for
        artists: Artists collected so far

    Returns:
        True if any entry in artists has the same name as artist
    """
    for candidate in artists:
        if candidate.name == artist.name:
            return True
    return False
279
280
def create_provider_mapping(
    identifier: str, domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> ProviderMapping:
    """
    Create a standardized provider mapping for an item.

    Args:
        identifier: Item identifier, used as the mapping's item id
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs; called once with the identifier

    Returns:
        A ProviderMapping flagged as available and carrying the generated URL
    """
    item_url = item_url_func(identifier)
    return ProviderMapping(
        item_id=identifier,
        provider_domain=domain,
        provider_instance=instance_id,
        url=item_url,
        available=True,
    )
292
293
def create_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """
    Create an Artist object from a creator name.

    The creator name doubles as the artist's item id, both on the artist itself and on
    its single provider mapping.

    Args:
        creator_name: Creator name, used as item id and display name
        domain: Provider domain
        instance_id: Provider instance identifier

    Returns:
        An Artist with one provider mapping for this provider instance
    """
    mapping = ProviderMapping(
        item_id=creator_name,
        provider_domain=domain,
        provider_instance=instance_id,
    )
    return Artist(
        item_id=creator_name,
        provider=instance_id,
        name=creator_name,
        provider_mappings={mapping},
    )
308
309
def add_item_image(
    item: Track | Album | Audiobook | Podcast, identifier: str, instance_id: str
) -> None:
    """
    Add a thumbnail image to a media item if available.

    Nothing is added when get_image_url returns a falsy value for the identifier.

    Args:
        item: Media item whose metadata receives the image
        identifier: Item identifier passed to get_image_url
        instance_id: Provider instance identifier recorded on the image
    """
    thumb_url = get_image_url(identifier)
    if not thumb_url:
        return

    thumbnail = MediaItemImage(
        type=ImageType.THUMB,
        path=thumb_url,
        provider=instance_id,
        remotely_accessible=True,
    )
    item.metadata.add_image(thumbnail)
323
324
def is_audiobook_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is audiobook content.

    Checks if the item is from a known audiobook collection.

    Args:
        doc: Internet Archive document metadata. "collection" may be a single string,
            a list of strings, missing or None.

    Returns:
        True if any of the item's collections is listed in AUDIOBOOK_COLLECTIONS
    """
    # `or []` also covers an explicit null value, which `doc.get(key, [])` would pass
    # through and which cannot be iterated.
    collection = doc.get("collection") or []
    if isinstance(collection, str):
        # A bare string is one collection name, not a sequence of characters
        collection = [collection]

    return any(coll in AUDIOBOOK_COLLECTIONS for coll in collection)
342
343
def doc_to_podcast(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Podcast | None:
    """
    Build a Podcast from an Internet Archive document.

    The podcast receives the document's identifier and cleaned title, one provider
    mapping, the cleaned creator as publisher (when non-empty), the cleaned description
    (when non-empty) and a thumbnail (when one is available).

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        The Podcast, or None when the identifier or the cleaned title is missing/empty
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    publisher = clean_text(doc.get("creator"))

    # Both an identifier and a usable title are required
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    show = Podcast(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # The creator field is stored as the publisher
    if publisher:
        show.publisher = publisher

    summary = clean_text(doc.get("description"))
    if summary:
        show.metadata.description = summary

    add_item_image(show, item_id, instance_id)
    return show
385
386
def is_podcast_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is podcast content.

    Args:
        doc: Internet Archive document metadata. "collection" may be a single string,
            a list of strings, missing or None.

    Returns:
        True if the item belongs to the "podcasts" collection
    """
    # `or []` also covers an explicit null value, which `doc.get(key, [])` would pass
    # through and which does not support the membership test below.
    collection = doc.get("collection") or []
    if isinstance(collection, str):
        # Wrap a bare string so the test is exact-name membership, not a substring match
        collection = [collection]

    return "podcasts" in collection
402

1"""Metadata parsing utilities for the Internet Archive provider.""" 2 3from __future__ import annotations 4 5import re 6from collections.abc import Callable 7from typing import Any 8 9from music_assistant_models.enums import AlbumType, ImageType 10from music_assistant_models.media_items import ( 11 Album, 12 Artist, 13 Audiobook, 14 MediaItemImage, 15 Podcast, 16 ProviderMapping, 17 Track, 18) 19from music_assistant_models.unique_list import UniqueList 20 21from .constants import AUDIOBOOK_COLLECTIONS 22from .helpers import clean_text, extract_year, get_image_url 23 24 25def is_likely_album(doc: dict[str, Any]) -> bool: 26 """ 27 Determine if an Internet Archive item is likely an album using metadata heuristics. 28 29 Uses collection types, media types, title analysis, and file count hints to classify items 30 without making expensive API calls to check individual file counts. 31 32 Args: 33 doc: Internet Archive document metadata 34 35 Returns: 36 True if the item is likely an album, False if likely a single track 37 """ 38 mediatype = doc.get("mediatype", "") 39 collection = doc.get("collection", []) 40 title = clean_text(doc.get("title", "")).lower() 41 42 if isinstance(collection, str): 43 collection = [collection] 44 45 # etree collection items are almost always live concert albums 46 if "etree" in collection: 47 return True 48 49 # Skip obvious audiobook/speech collections - these are handled separately 50 if any(coll in AUDIOBOOK_COLLECTIONS for coll in collection): 51 return False 52 53 # Check for hints in the metadata that suggest multiple files 54 # Some IA items include file count information 55 if "files" in doc: 56 # If we have file info and it's more than 2-3 files, likely an album 57 # (accounting for derivative files like thumbnails) 58 try: 59 file_count = len(doc["files"]) if isinstance(doc["files"], list) else 0 60 if file_count > 3: # More than just 1-2 audio files + derivatives 61 return True 62 except (TypeError, KeyError): 63 pass 64 65 # 
Use title keywords to identify likely albums vs singles 66 album_indicators = [ 67 "album", 68 "live", 69 "concert", 70 "session", 71 "collection", 72 "compilation", 73 "complete", 74 "anthology", 75 "best of", 76 "greatest hits", 77 "discography", 78 "vol ", 79 "volume", 80 "part ", 81 "disc ", 82 "cd ", 83 "lp ", 84 ] 85 86 single_indicators = [ 87 "single", 88 "track", 89 "song", 90 "remix", 91 "edit", 92 "version", 93 "demo", 94 "instrumental", 95 "acoustic version", 96 ] 97 98 # Strong album indicators in title 99 if any(indicator in title for indicator in album_indicators): 100 return True 101 102 # Strong single indicators in title 103 if any(indicator in title for indicator in single_indicators): 104 return False 105 106 # Collection-specific logic 107 if "netlabels" in collection: 108 # Netlabel releases are usually albums/EPs 109 return True 110 111 if "78rpm" in collection: 112 # 78 RPM records are usually single tracks (A-side/B-side) 113 return False 114 115 if "oldtimeradio" in collection: 116 # Radio shows are usually single episodes, treat as tracks 117 return False 118 119 if "audio_music" in collection: 120 # General music uploads - check for multi-track indicators in title 121 multi_track_indicators = ["ep", "album", "mixtape", "playlist"] 122 return any(indicator in title for indicator in multi_track_indicators) 123 124 # For unknown collections with audio mediatype, be conservative 125 # Default to single track unless we have strong evidence of multiple tracks 126 if mediatype == "audio": 127 # Look for numbering that suggests multiple parts/tracks 128 if re.search(r"\b(track|part|chapter)\s*\d+", title): 129 return True # Likely part of a larger work 130 return bool(re.search(r"\b\d+\s*of\s*\d+\b", title)) 131 132 return False 133 134 135def doc_to_audiobook( 136 doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str] 137) -> Audiobook | None: 138 """ 139 Convert Internet Archive document to Audiobook object. 
140 141 Args: 142 doc: Internet Archive document metadata 143 domain: Provider domain 144 instance_id: Provider instance identifier 145 item_url_func: Function to generate item URLs 146 147 Returns: 148 Audiobook object or None if conversion fails 149 """ 150 identifier = doc.get("identifier") 151 title = clean_text(doc.get("title")) 152 creator = clean_text(doc.get("creator")) 153 154 if not identifier or not title: 155 return None 156 157 audiobook = Audiobook( 158 item_id=identifier, 159 provider=instance_id, 160 name=title, 161 provider_mappings={create_provider_mapping(identifier, domain, instance_id, item_url_func)}, 162 ) 163 164 # Add author/narrator 165 if creator: 166 audiobook.authors.append(creator) 167 168 # Add metadata 169 if description := clean_text(doc.get("description")): 170 audiobook.metadata.description = description 171 172 # Add thumbnail 173 add_item_image(audiobook, identifier, instance_id) 174 175 return audiobook 176 177 178def doc_to_track( 179 doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str] 180) -> Track | None: 181 """ 182 Convert Internet Archive document to Track object. 
183 184 Args: 185 doc: Internet Archive document metadata 186 domain: Provider domain 187 instance_id: Provider instance identifier 188 item_url_func: Function to generate item URLs 189 190 Returns: 191 Track object or None if conversion fails 192 """ 193 identifier = doc.get("identifier") 194 title = clean_text(doc.get("title")) 195 creator = clean_text(doc.get("creator")) 196 197 if not identifier or not title: 198 return None 199 200 track = Track( 201 item_id=identifier, 202 provider=instance_id, 203 name=title, 204 provider_mappings={create_provider_mapping(identifier, domain, instance_id, item_url_func)}, 205 ) 206 207 # Add artist if available 208 if creator: 209 track.artists = UniqueList([create_artist(creator, domain, instance_id)]) 210 211 # Add thumbnail 212 add_item_image(track, identifier, instance_id) 213 214 return track 215 216 217def doc_to_album( 218 doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str] 219) -> Album | None: 220 """ 221 Convert Internet Archive document to Album object. 
222 223 Args: 224 doc: Internet Archive document metadata 225 domain: Provider domain 226 instance_id: Provider instance identifier 227 item_url_func: Function to generate item URLs 228 229 Returns: 230 Album object or None if conversion fails 231 """ 232 identifier = doc.get("identifier") 233 title = clean_text(doc.get("title")) 234 creator = clean_text(doc.get("creator")) 235 236 if not identifier or not title: 237 return None 238 239 album = Album( 240 item_id=identifier, 241 provider=instance_id, 242 name=title, 243 provider_mappings={create_provider_mapping(identifier, domain, instance_id, item_url_func)}, 244 ) 245 246 # Add artist if available 247 if creator: 248 album.artists = UniqueList([create_artist(creator, domain, instance_id)]) 249 250 # Add metadata 251 if date := extract_year(doc.get("date")): 252 album.year = date 253 254 if description := clean_text(doc.get("description")): 255 album.metadata.description = description 256 257 # Add thumbnail 258 add_item_image(album, identifier, instance_id) 259 260 # Add album type 261 album.album_type = AlbumType.ALBUM 262 263 return album 264 265 266def doc_to_artist(creator_name: str, domain: str, instance_id: str) -> Artist: 267 """Convert creator name to Artist object.""" 268 return create_artist(creator_name, domain, instance_id) 269 270 271def create_title_from_identifier(identifier: str) -> str: 272 """Create a human-readable title from an Internet Archive identifier.""" 273 return identifier.replace("_", " ").replace("-", " ").title() 274 275 276def artist_exists(artist: Artist, artists: list[Artist]) -> bool: 277 """Check if an artist already exists in the list to avoid duplicates.""" 278 return any(existing.name == artist.name for existing in artists) 279 280 281def create_provider_mapping( 282 identifier: str, domain: str, instance_id: str, item_url_func: Callable[[str], str] 283) -> ProviderMapping: 284 """Create a standardized provider mapping for an item.""" 285 return ProviderMapping( 286 
item_id=identifier, 287 provider_domain=domain, 288 provider_instance=instance_id, 289 url=item_url_func(identifier), 290 available=True, 291 ) 292 293 294def create_artist(creator_name: str, domain: str, instance_id: str) -> Artist: 295 """Create an Artist object from creator name.""" 296 return Artist( 297 item_id=creator_name, 298 provider=instance_id, 299 name=creator_name, 300 provider_mappings={ 301 ProviderMapping( 302 item_id=creator_name, 303 provider_domain=domain, 304 provider_instance=instance_id, 305 ) 306 }, 307 ) 308 309 310def add_item_image( 311 item: Track | Album | Audiobook | Podcast, identifier: str, instance_id: str 312) -> None: 313 """Add thumbnail image to a media item if available.""" 314 if thumb_url := get_image_url(identifier): 315 item.metadata.add_image( 316 MediaItemImage( 317 type=ImageType.THUMB, 318 path=thumb_url, 319 provider=instance_id, 320 remotely_accessible=True, 321 ) 322 ) 323 324 325def is_audiobook_content(doc: dict[str, Any]) -> bool: 326 """ 327 Determine if an Internet Archive item is audiobook content. 328 329 Checks if the item is from a known audiobook collection. 330 331 Args: 332 doc: Internet Archive document metadata 333 334 Returns: 335 True if the item is from a known audiobook collection 336 """ 337 collection = doc.get("collection", []) 338 if isinstance(collection, str): 339 collection = [collection] 340 341 return any(coll in AUDIOBOOK_COLLECTIONS for coll in collection) 342 343 344def doc_to_podcast( 345 doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str] 346) -> Podcast | None: 347 """ 348 Convert Internet Archive document to Podcast object. 
349 350 Args: 351 doc: Internet Archive document metadata 352 domain: Provider domain 353 instance_id: Provider instance identifier 354 item_url_func: Function to generate item URLs 355 356 Returns: 357 Podcast object or None if conversion fails 358 """ 359 identifier = doc.get("identifier") 360 title = clean_text(doc.get("title")) 361 creator = clean_text(doc.get("creator")) 362 363 if not identifier or not title: 364 return None 365 366 podcast = Podcast( 367 item_id=identifier, 368 provider=instance_id, 369 name=title, 370 provider_mappings={create_provider_mapping(identifier, domain, instance_id, item_url_func)}, 371 ) 372 373 # Add publisher/creator 374 if creator: 375 podcast.publisher = creator 376 377 # Add metadata 378 if description := clean_text(doc.get("description")): 379 podcast.metadata.description = description 380 381 # Add thumbnail 382 add_item_image(podcast, identifier, instance_id) 383 384 return podcast 385 386 387def is_podcast_content(doc: dict[str, Any]) -> bool: 388 """ 389 Determine if an Internet Archive item is podcast content. 390 391 Args: 392 doc: Internet Archive document metadata 393 394 Returns: 395 True if the item is from a podcast collection 396 """ 397 collection = doc.get("collection", []) 398 if isinstance(collection, str): 399 collection = [collection] 400 401 return "podcasts" in collection 402