/
/
/
1"""Metadata parsing utilities for the Internet Archive provider."""
2
3from __future__ import annotations
4
5import re
6from collections.abc import Callable
7from typing import Any
8
9from music_assistant_models.enums import AlbumType, ImageType
10from music_assistant_models.media_items import (
11 Album,
12 Artist,
13 Audiobook,
14 MediaItemImage,
15 Podcast,
16 ProviderMapping,
17 Track,
18)
19from music_assistant_models.unique_list import UniqueList
20
21from .constants import AUDIOBOOK_COLLECTIONS
22from .helpers import clean_text, extract_year, get_image_url
23
24
def is_likely_album(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is likely an album using metadata heuristics.

    Uses collection membership, an optional file list, and title keywords to classify
    items without making expensive API calls to check individual file counts.

    The checks run in priority order; the first one that matches decides the result:

    1. Member of the "etree" collection -> album.
    2. Member of any collection in AUDIOBOOK_COLLECTIONS -> not an album.
    3. A "files" list with more than 3 entries -> album.
    4. An album keyword in the title -> album.
    5. A single-track keyword in the title -> not an album.
    6. Collection defaults (netlabels, 78rpm, oldtimeradio, audio_music).
    7. For mediatype "audio": numbering patterns in the title.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is likely an album, False if likely a single track
    """
    mediatype = doc.get("mediatype", "")
    collection = doc.get("collection", [])
    title = clean_text(doc.get("title", "")).lower()

    # The collection field may be a single string rather than a list; normalize it
    # so the membership tests below compare whole collection names.
    if isinstance(collection, str):
        collection = [collection]

    # etree collection items are almost always live concert albums
    if "etree" in collection:
        return True

    # Skip obvious audiobook/speech collections - these are handled separately
    if any(coll in AUDIOBOOK_COLLECTIONS for coll in collection):
        return False

    # Some documents carry a file list. More than 3 entries suggests several audio
    # files rather than 1-2 audio files plus derivatives such as thumbnails.
    files = doc.get("files")
    if isinstance(files, list) and len(files) > 3:
        return True

    # Title keywords, matched as plain substrings of the lower-cased title.
    # The trailing space on the abbreviations is part of the pattern.
    album_indicators = (
        "album",
        "live",
        "concert",
        "session",
        "collection",
        "compilation",
        "complete",
        "anthology",
        "best of",
        "greatest hits",
        "discography",
        "vol ",
        "volume",
        "part ",
        "disc ",
        "cd ",
        "lp ",
    )

    single_indicators = (
        "single",
        "track",
        "song",
        "remix",
        "edit",
        "version",
        "demo",
        "instrumental",
        "acoustic version",
    )

    # Strong album indicators in title (checked first, so they win over single indicators)
    if any(indicator in title for indicator in album_indicators):
        return True

    # Strong single indicators in title
    if any(indicator in title for indicator in single_indicators):
        return False

    # Collection-specific logic
    if "netlabels" in collection:
        # Netlabel releases are usually albums/EPs
        return True

    if "78rpm" in collection:
        # 78 RPM records are usually single tracks (A-side/B-side)
        return False

    if "oldtimeradio" in collection:
        # Radio shows are usually single episodes, treat as tracks
        return False

    if "audio_music" in collection:
        # General music uploads - check for multi-track indicators in title
        multi_track_indicators = ("album", "mixtape", "playlist")
        if any(indicator in title for indicator in multi_track_indicators):
            return True
        # "ep" must stand alone: as a raw substring it also matches ordinary words
        # such as "deep", "keep", "september" or "episode". Require that it is not
        # glued to another letter; digits and punctuation ("ep1", "ep.") still match.
        return bool(re.search(r"(?<![a-z])ep(?![a-z])", title))

    # For unknown collections with audio mediatype, be conservative
    # Default to single track unless we have strong evidence of multiple tracks
    if mediatype == "audio":
        # Look for numbering that suggests multiple parts/tracks. Titles containing
        # "track" or "part " were already decided by the keyword checks above, so in
        # practice this catches forms like "chapter 3" or "part2".
        if re.search(r"\b(track|part|chapter)\s*\d+", title):
            return True  # Likely part of a larger work
        # "3 of 12"-style numbering
        return bool(re.search(r"\b\d+\s*of\s*\d+\b", title))

    return False
133
134
def doc_to_audiobook(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Audiobook | None:
    """
    Build an Audiobook from an Internet Archive document.

    The cleaned "creator" value, when present, is appended to the audiobook's
    authors, and the cleaned "description" is stored in its metadata.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Audiobook object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    author = clean_text(doc.get("creator"))

    # An identifier and a non-empty title are the minimum needed to build the item.
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    book = Audiobook(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Author/narrator
    if author:
        book.authors.append(author)

    # Description
    summary = clean_text(doc.get("description"))
    if summary:
        book.metadata.description = summary

    # Thumbnail
    add_item_image(book, item_id, instance_id)

    return book
176
177
def doc_to_track(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Track | None:
    """
    Build a Track from an Internet Archive document.

    The cleaned "creator" value, when present, becomes the track's only artist.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Track object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # An identifier and a non-empty title are the minimum needed to build the item.
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Track(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Artist, if the document names a creator
    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    # Thumbnail
    add_item_image(result, item_id, instance_id)

    return result
215
216
def doc_to_album(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Album | None:
    """
    Build an Album from an Internet Archive document.

    Populates the artist (from "creator"), the year (from "date" via extract_year)
    and the description when those values are available. The album type is always
    set to AlbumType.ALBUM.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Album object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    artist_name = clean_text(doc.get("creator"))

    # An identifier and a non-empty title are the minimum needed to build the item.
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    result = Album(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Artist, if the document names a creator
    if artist_name:
        artist = create_artist(artist_name, domain, instance_id)
        result.artists = UniqueList([artist])

    # Release year
    year = extract_year(doc.get("date"))
    if year:
        result.year = year

    # Description
    summary = clean_text(doc.get("description"))
    if summary:
        result.metadata.description = summary

    # Thumbnail
    add_item_image(result, item_id, instance_id)

    # Every item converted here is reported as a regular album.
    result.album_type = AlbumType.ALBUM

    return result
264
265
def doc_to_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """
    Convert a creator name to an Artist object.

    Thin wrapper that delegates to create_artist with the same arguments.
    """
    return create_artist(creator_name=creator_name, domain=domain, instance_id=instance_id)
269
270
def create_title_from_identifier(identifier: str) -> str:
    """
    Derive a human-readable title from an Internet Archive identifier.

    Underscores and hyphens become spaces and the result is title-cased.
    """
    separators_to_spaces = str.maketrans("_-", "  ")
    spaced = identifier.translate(separators_to_spaces)
    return spaced.title()
274
275
def artist_exists(artist: Artist, artists: list[Artist]) -> bool:
    """
    Report whether an artist with the same name is already in the list.

    Only the ``name`` attribute is compared.
    """
    for known in artists:
        if known.name == artist.name:
            return True
    return False
279
280
def create_provider_mapping(
    identifier: str, domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> ProviderMapping:
    """
    Create a standardized provider mapping for an item.

    The mapping's URL comes from calling ``item_url_func`` with the identifier,
    and the item is always marked as available.
    """
    item_url = item_url_func(identifier)
    return ProviderMapping(
        item_id=identifier,
        provider_domain=domain,
        provider_instance=instance_id,
        url=item_url,
        available=True,
    )
292
293
def create_artist(creator_name: str, domain: str, instance_id: str) -> Artist:
    """
    Create an Artist object from a creator name.

    The creator name is used both as the artist's item id and as its display
    name. The single provider mapping carries no URL.
    """
    mapping = ProviderMapping(
        item_id=creator_name,
        provider_domain=domain,
        provider_instance=instance_id,
    )
    return Artist(
        item_id=creator_name,
        provider=instance_id,
        name=creator_name,
        provider_mappings={mapping},
    )
308
309
def add_item_image(
    item: Track | Album | Audiobook | Podcast, identifier: str, instance_id: str
) -> None:
    """
    Attach a thumbnail image to a media item.

    Does nothing when get_image_url returns a falsy value for the identifier.
    """
    thumb_url = get_image_url(identifier)
    if not thumb_url:
        return

    thumbnail = MediaItemImage(
        type=ImageType.THUMB,
        path=thumb_url,
        provider=instance_id,
        remotely_accessible=True,
    )
    item.metadata.add_image(thumbnail)
323
324
def is_audiobook_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is audiobook content.

    An item counts as an audiobook when at least one of its collections is
    listed in AUDIOBOOK_COLLECTIONS.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is from a known audiobook collection
    """
    raw = doc.get("collection", [])
    # A single collection may be given as a bare string; treat it as a one-item list.
    names = [raw] if isinstance(raw, str) else raw

    for name in names:
        if name in AUDIOBOOK_COLLECTIONS:
            return True
    return False
342
343
def doc_to_podcast(
    doc: dict[str, Any], domain: str, instance_id: str, item_url_func: Callable[[str], str]
) -> Podcast | None:
    """
    Build a Podcast from an Internet Archive document.

    The cleaned "creator" value, when present, is stored as the publisher, and
    the cleaned "description" is stored in the podcast's metadata.

    Args:
        doc: Internet Archive document metadata
        domain: Provider domain
        instance_id: Provider instance identifier
        item_url_func: Function to generate item URLs

    Returns:
        Podcast object, or None when the document has no identifier or no title
    """
    item_id = doc.get("identifier")
    name = clean_text(doc.get("title"))
    publisher = clean_text(doc.get("creator"))

    # An identifier and a non-empty title are the minimum needed to build the item.
    if not (item_id and name):
        return None

    mapping = create_provider_mapping(item_id, domain, instance_id, item_url_func)
    show = Podcast(
        item_id=item_id,
        provider=instance_id,
        name=name,
        provider_mappings={mapping},
    )

    # Publisher/creator
    if publisher:
        show.publisher = publisher

    # Description
    summary = clean_text(doc.get("description"))
    if summary:
        show.metadata.description = summary

    # Thumbnail
    add_item_image(show, item_id, instance_id)

    return show
385
386
def is_podcast_content(doc: dict[str, Any]) -> bool:
    """
    Determine if an Internet Archive item is podcast content.

    The collection name must equal "podcasts" exactly; a name that merely
    contains that text does not count.

    Args:
        doc: Internet Archive document metadata

    Returns:
        True if the item is from a podcast collection
    """
    raw = doc.get("collection", [])
    # A single collection may be given as a bare string; wrap it so the test
    # below compares whole names instead of searching inside the string.
    names = [raw] if isinstance(raw, str) else raw
    return "podcasts" in names
402