Merge branch 'fix/musify_scrapes_year_as_artist' into experimental

Hazel 2024-05-10 17:52:11 +02:00
commit 04405f88eb
17 changed files with 134 additions and 85 deletions

.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,22 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"name": "Python Debugger: Download script",
"type": "debugpy",
"request": "launch",
"program": "development/actual_donwload.py",
"console": "integratedTerminal"
}
]
}

View File

@ -19,6 +19,7 @@
"albumsort",
"APIC",
"Bandcamp",
"bitrate",
"dotenv",
"encyclopaedia",
"ENDC",
@ -27,6 +28,7 @@
"metallum",
"musify",
"OKBLUE",
"OKGREEN",
"pathvalidate",
"Referer",
"sponsorblock",

View File

@ -6,9 +6,8 @@ logging.getLogger().setLevel(logging.DEBUG)
if __name__ == "__main__":
commands = [
"s: #a Crystal F",
"10",
"2",
"s: #a Psychonaut 4",
"d: 0",
]

View File

@ -47,7 +47,7 @@ def correct_codec(target: Target, bitrate_kb: int = main_settings["bitrate"], au
# run the ffmpeg command with a progressbar
ff = FfmpegProgress(ffmpeg_command)
with tqdm(total=100, desc=f"removing {len(interval_list)} segments") as pbar:
with tqdm(total=100, desc=f"processing") as pbar:
for progress in ff.run_command_with_progress():
pbar.update(progress-pbar.n)
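
The progress bar above is driven by ffmpeg-progress-yield: FfmpegProgress wraps the ffmpeg command and run_command_with_progress() yields the overall percentage, which is fed to tqdm as a delta. A minimal standalone sketch of that pattern (input/output paths and bitrate are placeholders, not taken from the repository):

```python
from ffmpeg_progress_yield import FfmpegProgress
from tqdm import tqdm

# Hypothetical re-encode; the real command is assembled by correct_codec().
ffmpeg_command = ["ffmpeg", "-y", "-i", "input.mp3", "-b:a", "125k", "output.mp3"]

ff = FfmpegProgress(ffmpeg_command)
with tqdm(total=100, desc="processing") as pbar:
    for progress in ff.run_command_with_progress():
        # progress is the overall percentage (0-100), so update by the delta.
        pbar.update(progress - pbar.n)
```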

View File

@ -166,9 +166,9 @@ class Downloader:
self.genre = genre or get_genre()
self.process_metadata_anyway = process_metadata_anyway
print()
print(f"Downloading to: \"{self.genre}\"")
print()
output()
output(f"Downloading to: \"{self.genre}\"", color=BColors.HEADER)
output()
def print_current_options(self):
self.page_dict = dict()
@ -312,10 +312,8 @@ class Downloader:
def download(self, data_objects: List[DatabaseObject], **kwargs) -> bool:
output()
if len(data_objects) == 1:
output(f"Downloading {data_objects[0].option_string}...", color=BColors.BOLD)
else:
output(f"Downloading {len(data_objects)} objects...", *("- " + o.option_string for o in data_objects), color=BColors.BOLD, sep="\n")
if len(data_objects) > 1:
output(f"Downloading {len(data_objects)} objects...", *("- " + o.option_string for o in data_objects), color=BColors.BOLD, sep="\n")
_result_map: Dict[DatabaseObject, DownloadResult] = dict()
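
output() is a project-internal helper; the call above relies on it joining all positional arguments with sep and colorizing the whole block. A rough sketch of that assumed behaviour (the signature and the plain-string BColors here are guesses based on this diff; the project's BColors looks like an enum):

```python
from types import SimpleNamespace

class BColors:
    BOLD = "\033[1m"
    ENDC = "\033[0m"

def output(*lines: str, color: str = "", sep: str = " ") -> None:
    # Assumed shape: join every positional argument, wrap the block in the color code.
    print(color + sep.join(lines) + BColors.ENDC)

data_objects = [SimpleNamespace(option_string="Song: example song"),
                SimpleNamespace(option_string="Album: example album")]
if len(data_objects) > 1:
    output(f"Downloading {len(data_objects)} objects...",
           *("- " + o.option_string for o in data_objects),
           color=BColors.BOLD, sep="\n")
```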

View File

@ -118,7 +118,7 @@ class Pages:
audio_pages = self._audio_pages_set.intersection(_page_types)
for download_page in audio_pages:
return self._page_instances[download_page].download(music_object=music_object, genre=genre, download_all=download_all, process_metadata_anyway=process_metadata_anyway)
return self._page_instances[download_page].download(music_object=music_object, genre=genre)
return DownloadResult(error_message=f"No audio source has been found for {music_object}.")
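
The dispatch above intersects the set of pages that can deliver audio with the page types the object actually has sources for, and hands the download to the first match. A toy sketch of that selection (page names and registry are placeholders):

```python
AUDIO_PAGES = {"Musify", "YouTubeMusic"}     # pages able to download audio
PAGE_INSTANCES = {"Musify": "musify page", "YouTubeMusic": "yt page", "Metallum": "metal archives"}

def pick_audio_page(page_types_of_sources: set):
    for page_type in AUDIO_PAGES.intersection(page_types_of_sources):
        return PAGE_INSTANCES[page_type]     # first usable audio page wins
    return None                              # caller wraps this in a DownloadResult error

assert pick_audio_page({"Metallum", "Musify"}) == "musify page"
assert pick_audio_page({"Metallum"}) is None
```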

View File

@ -153,6 +153,8 @@ class Collection(Generic[T]):
if other is None:
return
if not other._inner._has_data:
return
if other.id in self._indexed_from_id:
return

View File

@ -32,7 +32,7 @@ class FormattedText:
if self.is_empty and other.is_empty:
return True
return self.doc == other.doc
return self.html == other.html
@property
def markdown(self) -> str:

View File

@ -30,6 +30,8 @@ class InnerData:
_refers_to_instances: set = None
_is_in_collection: set = None
_has_data: bool = False
"""
Attribute versions keep track of whether the attribute has been changed.
"""
@ -48,9 +50,19 @@ class InnerData:
for key, value in kwargs.items():
if hasattr(value, "__is_collection__"):
value._collection_for[self] = key
self.__setattr__(key, value)
if self._has_data:
continue
def __setattr__(self, key: str, value):
if self._has_data or not hasattr(self, "_default_values"):
return super().__setattr__(key, value)
super().__setattr__("_has_data", not (key in self._default_values and self._default_values[key] == value))
return super().__setattr__(key, value)
def __hash__(self):
return self.id
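
The overridden __setattr__ above flips _has_data the moment any attribute is set to something other than its registered default, and leaves it latched afterwards. A stripped-down, self-contained sketch of that mechanism (class and constructor are simplified; only the tracking idea is taken from the diff):

```python
class TrackedData:
    def __init__(self, default_values: dict, **kwargs):
        # Bypass the override while bootstrapping internal state.
        super().__setattr__("_has_data", False)
        super().__setattr__("_default_values", default_values)
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __setattr__(self, key, value):
        if not self._has_data:
            # A value differing from its default marks the object as carrying real data.
            is_default = key in self._default_values and self._default_values[key] == value
            super().__setattr__("_has_data", not is_default)
        super().__setattr__(key, value)

empty = TrackedData({"title": None}, title=None)
real = TrackedData({"title": None}, title="Intro")
assert not empty._has_data and real._has_data
```

This is also why the Song defaults below switch "title" to lambda: None, and why the Collection guard earlier in this commit can skip objects whose InnerData only carries defaults.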

View File

@ -111,7 +111,7 @@ class Song(Base):
"album_collection": Collection,
"feature_artist_collection": Collection,
"title": lambda: "",
"title": lambda: None,
"unified_title": lambda: None,
"isrc": lambda: None,
"genre": lambda: None,

View File

@ -3,8 +3,9 @@ import random
import re
from copy import copy
from pathlib import Path
from typing import Optional, Union, Type, Dict, Set, List, Tuple
from typing import Optional, Union, Type, Dict, Set, List, Tuple, TypedDict
from string import Formatter
from dataclasses import dataclass, field
import requests
from bs4 import BeautifulSoup
@ -28,11 +29,23 @@ from ..utils.config import main_settings
from ..utils.support_classes.query import Query
from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import fit_to_file_system
from ..utils import trace
from ..utils import trace, output, BColors
INDEPENDENT_DB_OBJECTS = Union[Label, Album, Artist, Song]
INDEPENDENT_DB_TYPES = Union[Type[Song], Type[Album], Type[Artist], Type[Label]]
@dataclass
class FetchOptions:
download_all: bool = False
album_type_blacklist: Set[AlbumType] = field(default_factory=lambda: set(AlbumType(a) for a in main_settings["album_type_blacklist"]))
@dataclass
class DownloadOptions:
download_all: bool = False
album_type_blacklist: Set[AlbumType] = field(default_factory=lambda: set(AlbumType(a) for a in main_settings["album_type_blacklist"]))
process_audio_if_found: bool = False
process_metadata_if_found: bool = True
class NamingDict(dict):
CUSTOM_KEYS: Dict[str, str] = {
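
FetchOptions and DownloadOptions bundle what used to be loose keyword arguments (download_all, process_metadata_anyway, ...) into two dataclasses whose blacklist default is built from main_settings per instance via field(default_factory=...); Page.__init__ in the next hunk stores one of each. A hedged, self-contained sketch of the same pattern (the settings dict and AlbumType values are invented for illustration):

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import Set

# Stand-ins for the project's main_settings and AlbumType; values are illustrative only.
main_settings = {"album_type_blacklist": ["Bootleg", "Live Album"]}

class AlbumType(Enum):
    STUDIO_ALBUM = "Studio Album"
    BOOTLEG = "Bootleg"
    LIVE_ALBUM = "Live Album"

@dataclass
class DownloadOptions:
    download_all: bool = False
    # default_factory runs per instance, so two option objects never share one set.
    album_type_blacklist: Set[AlbumType] = field(
        default_factory=lambda: {AlbumType(a) for a in main_settings["album_type_blacklist"]}
    )
    process_audio_if_found: bool = False
    process_metadata_if_found: bool = True

opts = DownloadOptions()
assert AlbumType.BOOTLEG in opts.album_type_blacklist and not opts.download_all
```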
@ -101,6 +114,10 @@ class Page:
# set this to true, if all song details can also be fetched by fetching album details
NO_ADDITIONAL_DATA_FROM_SONG = False
def __init__(self, download_options: DownloadOptions = None, fetch_options: FetchOptions = None):
self.download_options: DownloadOptions = download_options or DownloadOptions()
self.fetch_options: FetchOptions = fetch_options or FetchOptions()
def _search_regex(self, pattern, string, default=None, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
@ -176,7 +193,6 @@ class Page:
self,
music_object: DatabaseObject,
stop_at_level: int = 1,
post_process: bool = True
) -> DatabaseObject:
"""
when a music object with lacking data is passed in, it returns
@ -208,7 +224,6 @@ class Page:
source=source,
enforce_type=type(music_object),
stop_at_level=stop_at_level,
post_process=False,
type_string=type(music_object).__name__,
entity_string=music_object.option_string,
)
@ -230,7 +245,6 @@ class Page:
source: Source,
stop_at_level: int = 2,
enforce_type: Type[DatabaseObject] = None,
post_process: bool = True,
type_string: str = "",
entity_string: str = "",
) -> Optional[DatabaseObject]:
@ -268,7 +282,7 @@ class Page:
for sub_element in collection:
sub_element.merge(
self.fetch_details(sub_element, stop_at_level=stop_at_level - 1, post_process=False))
self.fetch_details(sub_element, stop_at_level=stop_at_level - 1))
return music_object
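
With the post_process flag gone, fetch_details simply recurses into each sub-collection with a decremented stop_at_level and merges the result. The control flow, reduced to a runnable toy (the Node tree stands in for the real Song/Album/Artist objects):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    name: str
    fetched: bool = False
    children: List["Node"] = field(default_factory=list)

def fetch_details(node: Node, stop_at_level: int = 1) -> Node:
    node.fetched = True                      # fetch this object's own data
    if stop_at_level > 1:                    # then descend, one level cheaper each time
        for child in node.children:
            fetch_details(child, stop_at_level=stop_at_level - 1)
    return node

artist = Node("artist", children=[Node("album", children=[Node("song")])])
fetch_details(artist, stop_at_level=2)
assert artist.fetched and artist.children[0].fetched
assert not artist.children[0].children[0].fetched   # the song is beyond the level limit
```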
@ -288,8 +302,6 @@ class Page:
self,
music_object: DatabaseObject,
genre: str,
download_all: bool = False,
process_metadata_anyway: bool = True
) -> DownloadResult:
naming_dict: NamingDict = NamingDict({"genre": genre})
@ -308,25 +320,22 @@ class Page:
fill_naming_objects(music_object)
return self._download(music_object, naming_dict, download_all, process_metadata_anyway=process_metadata_anyway)
return self._download(music_object, naming_dict)
def _download(
self,
music_object: DatabaseObject,
naming_dict: NamingDict,
download_all: bool = False,
skip_details: bool = False,
process_metadata_anyway: bool = True
**kwargs
) -> DownloadResult:
trace(f"downloading {type(music_object).__name__} [{music_object.option_string}]")
skip_next_details = skip_details
if isinstance(music_object, Song):
output(f"Downloading {music_object.option_string} to:", color=BColors.BOLD)
else:
output(f"Downloading {music_object.option_string}...", color=BColors.BOLD)
# Skip all releases that are defined in shared.ALBUM_TYPE_BLACKLIST if download_all is False
if isinstance(music_object, Album):
if self.NO_ADDITIONAL_DATA_FROM_SONG:
skip_next_details = True
if not download_all and music_object.album_type.value in main_settings["album_type_blacklist"]:
if not self.download_options.download_all and music_object.album_type in self.download_options.album_type_blacklist:
return DownloadResult()
if not (isinstance(music_object, Song) and self.NO_ADDITIONAL_DATA_FROM_SONG):
@ -338,7 +347,7 @@ class Page:
naming_dict.add_object(music_object)
if isinstance(music_object, Song):
return self._download_song(music_object, naming_dict, process_metadata_anyway=process_metadata_anyway)
return self._download_song(music_object, naming_dict)
download_result: DownloadResult = DownloadResult()
@ -347,13 +356,11 @@ class Page:
sub_ordered_music_object: DatabaseObject
for sub_ordered_music_object in collection:
download_result.merge(self._download(sub_ordered_music_object, naming_dict.copy(), download_all,
skip_details=skip_next_details,
process_metadata_anyway=process_metadata_anyway))
download_result.merge(self._download(sub_ordered_music_object, naming_dict.copy()))
return download_result
def _download_song(self, song: Song, naming_dict: NamingDict, process_metadata_anyway: bool = True):
def _download_song(self, song: Song, naming_dict: NamingDict):
if "genre" not in naming_dict and song.genre is not None:
naming_dict["genre"] = song.genre
@ -373,40 +380,33 @@ class Page:
if song.target_collection.empty:
song.target_collection.append(new_target)
r = DownloadResult(1)
temp_target: Target = Target.temp()
found_on_disc = False
target: Target
for target in song.target_collection:
current_exists = target.exists
if current_exists:
output(f'- {target.file_path} {BColors.OKGREEN.value}[already exists]', color=BColors.GREY)
target.copy_content(temp_target)
found_on_disc = True
r.found_on_disk += 1
r.add_target(target)
else:
output(f'- {target.file_path}', color=BColors.GREY)
if not song.source_collection.has_source_page(self.SOURCE_TYPE):
return DownloadResult(error_message=f"No {self.__class__.__name__} source found for {song.option_string}.")
sources = song.source_collection.get_sources(self.SOURCE_TYPE)
temp_target: Target = Target(
relative_to_music_dir=False,
file_path=Path(
main_settings["temp_directory"],
str(song.id)
)
)
r = DownloadResult(1)
found_on_disc = False
target: Target
for target in song.target_collection:
if target.exists:
if process_metadata_anyway:
target.copy_content(temp_target)
found_on_disc = True
r.found_on_disk += 1
r.add_target(target)
if found_on_disc and not process_metadata_anyway:
self.LOGGER.info(f"{song.option_string} already exists, thus not downloading again.")
return r
skip_intervals = []
if not found_on_disc:
for source in sources:
r = self.download_song_to_target(source=source, target=temp_target, desc=song.option_string)
r = self.download_song_to_target(source=source, target=temp_target, desc="downloading")
if not r.is_fatal_error:
skip_intervals = self.get_skip_intervals(song, source)
@ -417,16 +417,19 @@ class Page:
song=song,
temp_target=temp_target,
interval_list=skip_intervals,
found_on_disc=found_on_disc,
))
return r
def _post_process_targets(self, song: Song, temp_target: Target, interval_list: List) -> DownloadResult:
correct_codec(temp_target, interval_list=interval_list)
def _post_process_targets(self, song: Song, temp_target: Target, interval_list: List, found_on_disc: bool) -> DownloadResult:
if not found_on_disc or self.download_options.process_audio_if_found:
correct_codec(temp_target, interval_list=interval_list)
self.post_process_hook(song, temp_target)
write_metadata_to_target(song.metadata, temp_target, song)
if not found_on_disc or self.download_options.process_metadata_if_found:
write_metadata_to_target(song.metadata, temp_target, song)
r = DownloadResult()
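
The new found_on_disc flag gates post-processing against the two DownloadOptions switches: re-encoding an already present file is opt-in, rewriting its tags defaults to on. A minimal sketch of that gating (helper names and prints are placeholders, not the project's API):

```python
def correct_codec(path: str) -> None:
    print(f"re-encoding {path}")

def write_metadata(path: str) -> None:
    print(f"writing tags to {path}")

def post_process(path: str, found_on_disc: bool,
                 process_audio_if_found: bool = False,
                 process_metadata_if_found: bool = True) -> None:
    if not found_on_disc or process_audio_if_found:
        correct_codec(path)       # skip the expensive re-encode for files already on disk
    if not found_on_disc or process_metadata_if_found:
        write_metadata(path)      # tags are cheap and may have improved, so rewrite by default

post_process("song.mp3", found_on_disc=True)   # only rewrites the tags
```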

View File

@ -22,6 +22,7 @@ from ..objects import (
Artwork,
)
from ..connection import Connection
from ..utils import dump_to_file
from ..utils.support_classes.download_result import DownloadResult
from ..utils.string_processing import clean_song_title
from ..utils.config import main_settings, logging_settings

View File

@ -418,6 +418,10 @@ class Musify(Page):
href = artist_soup["href"]
if href is not None:
href_parts = href.split("/")
if len(href_parts) <= 1 or href_parts[-2] != "artist":
return
artist_src_list.append(Source(self.SOURCE_TYPE, self.HOST + href))
name_elem: BeautifulSoup = artist_soup.find("span", {"itemprop": "name"})
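
This is the fix the branch is named after: Musify marks release years with the same markup as artist links, so the parser now only accepts hrefs whose second-to-last path segment is literally "artist". A small sketch of that check (the hrefs are invented for illustration):

```python
def is_artist_href(href: str) -> bool:
    # Accept only paths like ".../artist/<slug>"; years and release links are skipped.
    href_parts = href.split("/")
    return len(href_parts) > 1 and href_parts[-2] == "artist"

assert is_artist_href("/artist/some-artist-12345")      # hypothetical artist link
assert not is_artist_href("/release/2009-something")    # would otherwise be scraped as an artist
assert not is_artist_href("2009")                       # no usable path structure at all
```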
@ -681,17 +685,20 @@ class Musify(Page):
anchor: BeautifulSoup = artist_crumb.find("a")
if anchor is not None:
href = anchor.get("href")
artist_source_list: List[Source] = []
if href is not None:
artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip()))
href_parts = href.split("/")
if not(len(href_parts) <= 1 or href_parts[-2] != "artist"):
artist_source_list: List[Source] = []
span: BeautifulSoup = anchor.find("span")
if span is not None:
artist_list.append(Artist(
name=span.get_text(strip=True),
source_list=artist_source_list
))
if href is not None:
artist_source_list.append(Source(self.SOURCE_TYPE, self.HOST + href.strip()))
span: BeautifulSoup = anchor.find("span")
if span is not None:
artist_list.append(Artist(
name=span.get_text(strip=True),
source_list=artist_source_list
))
else:
self.LOGGER.debug("there are not 4 breadcrumb items, which shouldn't be the case")
@ -938,10 +945,10 @@ class Musify(Page):
album_status_id = album_card.get("data-type")
if album_status_id.isdigit():
album_status_id = int(album_status_id)
album_type = ALBUM_TYPE_MAP[album_status_id]
album_kwargs["album_type"] = ALBUM_TYPE_MAP[album_status_id]
if album_status_id == 5:
album_status = AlbumStatus.BOOTLEG
album_kwargs["album_status"] = AlbumStatus.BOOTLEG
def parse_release_anchor(_anchor: BeautifulSoup, text_is_name=False):
nonlocal album_kwargs
@ -1037,7 +1044,7 @@ class Musify(Page):
for card_soup in soup.find_all("div", {"class": "card"}):
album = self._parse_album_card(card_soup, artist_name, **kwargs)
if album.album_type in _album_type_blacklist:
if not self.fetch_options.download_all and album.album_type in self.fetch_options.album_type_blacklist:
continue
artist.main_album_collection.append(album)

View File

@ -42,7 +42,7 @@ class YouTube(SuperYouTube):
SOURCE_TYPE = SourcePages.YOUTUBE
LOGGER = logging_settings["youtube_logger"]
NO_ADDITIONAL_DATA_FROM_SONG = True
NO_ADDITIONAL_DATA_FROM_SONG = False
def __init__(self, *args, **kwargs):
self.connection: Connection = Connection(

View File

@ -12,7 +12,7 @@ if not load_dotenv(Path(__file__).parent.parent.parent / ".env"):
__stage__ = os.getenv("STAGE", "prod")
DEBUG = (__stage__ == "dev") and True
DEBUG = (__stage__ == "dev") and False
DEBUG_LOGGING = DEBUG and False
DEBUG_TRACE = DEBUG and True
DEBUG_OBJECT_TRACE = DEBUG and False

View File

@ -116,10 +116,13 @@ def clean_song_title(raw_song_title: str, artist_name: Optional[str] = None) ->
# Remove artist from the start of the title
if raw_song_title.lower().startswith(artist_name.lower()):
raw_song_title = raw_song_title[len(artist_name):].strip()
if raw_song_title.startswith("-"):
raw_song_title = raw_song_title[1:].strip()
possible_new_name = raw_song_title[len(artist_name):].strip()
for char in ("-", "", ":", "|"):
if possible_new_name.startswith(char):
raw_song_title = possible_new_name[1:].strip()
break
return raw_song_title.strip()
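
The title cleaner now removes the artist prefix only when a separator confirms it was an artist credit, instead of unconditionally cutting a leading dash. A self-contained sketch of the logic above (the helper is renamed here because the real clean_song_title does more than this step; test strings are invented):

```python
from typing import Optional

def strip_artist_prefix(raw_song_title: str, artist_name: Optional[str] = None) -> str:
    if artist_name and raw_song_title.lower().startswith(artist_name.lower()):
        possible_new_name = raw_song_title[len(artist_name):].strip()
        # Only cut when a separator follows; otherwise the match is part of the title itself.
        for char in ("-", "–", ":", "|"):
            if possible_new_name.startswith(char):
                raw_song_title = possible_new_name[1:].strip()
                break
    return raw_song_title.strip()

assert strip_artist_prefix("Example Artist - Example Song", "Example Artist") == "Example Song"
assert strip_artist_prefix("Example Artistry", "Example Artist") == "Example Artistry"  # no separator
```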

View File

@ -69,7 +69,7 @@ dependencies = [
"toml~=0.10.2",
"typing_extensions~=4.7.1",
"python-sponsorblock~=0.0.0",
"python-sponsorblock~=0.0.dev1",
"youtube_dl",
]
dynamic = [