####################################################################### ## created by cooyou.org ####################################################################### import os import sys import re import numpy as np #import wav import librosa import soundfile as sf from pydub import AudioSegment import matplotlib.pyplot as plt import matplotlib.font_manager as fm from spleeter.separator import Separator from moviepy.editor import AudioFileClip from moviepy.editor import VideoFileClip #import subprocess np.set_printoptions(threshold=np.inf) def stft(x, frame_size, hop_size): window = np.hanning(frame_size) frames = [] for i in range(0, len(x) - frame_size, hop_size): frame = x[i:i + frame_size] * window frames.append(np.fft.rfft(frame)) return np.array(frames).T # shape: [freq_bins, time_frames] def istft(X, frame_size, hop_size): window = np.hanning(frame_size) time_frames = X.shape[1] x_len = frame_size + hop_size * (time_frames - 1) x = np.zeros(x_len) wsum = np.zeros(x_len) for n in range(time_frames): frame = np.fft.irfft(X[:, n]) start = n * hop_size x[start:start + frame_size] += frame * window wsum[start:start + frame_size] += window**2 # 正規化(重なり補正) nonzero = wsum > 1e-10 x[nonzero] /= wsum[nonzero] return x def phase_vocoder(S, rate): num_bins, num_frames = S.shape time_steps = np.arange(0, num_frames, rate) phi = np.angle(S[:, 0]) output = np.zeros((num_bins, len(time_steps)), dtype=np.complex64) for t_idx, step in enumerate(time_steps): i = int(np.floor(step)) frac = step - i if i + 1 >= num_frames: break mag = (1 - frac) * np.abs(S[:, i]) + frac * np.abs(S[:, i + 1]) delta = np.angle(S[:, i + 1]) - np.angle(S[:, i]) phi += delta output[:, t_idx] = mag * np.exp(1j * phi) return output def pitch_shift_audio(input_wav, output_wav, semitones, frame_size=1024, hop_size=256): # 音声読み込み y, sr = sf.read(input_wav) if y.ndim > 1: y = y.mean(axis=1) # モノラルに変換 # ピッチシフト倍率 rate = 2 ** (-semitones / 12.0) # STFT S = stft(y, frame_size, hop_size) # フェーズボコーダー S_shifted = phase_vocoder(S, rate) # 逆STFT y_stretched = istft(S_shifted, frame_size, hop_size) # 長さを元に戻す(リサンプル) len_orig = len(y) y_final = np.interp(np.linspace(0, 1, len_orig), np.linspace(0, 1, len(y_stretched)), y_stretched) # 書き出し sf.write(output_wav, y_final, sr) #def pitch_shift_audio1(input_wav, output_wav, semitones, sox_path=r"sox-14.4.2/sox.exe"): # cents = semitones * 100 # cmd = [sox_path, input_wav, output_wav, 'pitch', str(cents)] # subprocess.run(cmd, check=True) # def pitch_shift_audio3(input_path, output_path, semitone_shift): # librosaで読み込み y, sr = librosa.load(input_path, sr=None) # ピッチシフト y_shifted = librosa.effects.pitch_shift(y, sr, n_steps=semitone_shift) # 保存 sf.write(output_path, y_shifted, sr) def set_japanese_font(): # Windowsならメイリオを優先してセット try: font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf') meiro_font = [f for f in font_list if 'meiryo' in f.lower()] if meiro_font: plt.rcParams['font.family'] = 'Meiryo' else: # 代替フォント plt.rcParams['font.family'] = 'Arial Unicode MS' except: pass # -------------------------------- # 指定音域文字列を解析する関数 G3:D6 形式 def parse_range(range_str): parts = range_str.split(':') if len(parts) != 2: raise ValueError("音域指定は「G3:D6」の形式で指定してください。") return parts[0], parts[1] def extract_mp3(input_file): ext = os.path.splitext(input_file)[1].lower() if ext == '.mp3': audio_path=os.path.join(os.path.dirname(input_file),"original.mp3") print(f"入力はmp3なのでコピーします: {input_file} --> {audio_path} ") if os.path.exists(audio_path)==False : with open(input_file, "rb") as src, open(audio_path, "wb") as dst: dst.write(src.read()) else: print(f"コピーは行われませんでした。既に{audio_path} は存在してます。") return audio_path elif ext == '.mp4': print(f"mp4からmp3を抽出: {input_file}") audio_path = os.path.join(os.path.dirname(input_file),"original.mp3") if not os.path.exists(audio_path): clip = VideoFileClip(input_file) # ← ここを AudioFileClip から VideoFileClip に変更 clip.audio.write_audiofile(audio_path, verbose=False, logger=None) clip.close() return audio_path else: raise ValueError("対応している拡張子はmp4かmp3のみです。") # -------------------------------- # spleeterでボーカルと伴奏に分離(wav出力) def separate_vocals(mp3_file): separator = Separator('spleeter:2stems') output_dir = os.path.dirname(mp3_file) separator.separate_to_file(mp3_file, output_dir, codec='mp3', bitrate='192k') base_name = os.path.splitext(os.path.basename(mp3_file))[0] vocal_wav1 = os.path.join(output_dir, base_name,'vocals.mp3') accom_wav1 = os.path.join(output_dir, base_name,'accompaniment.mp3') vocal_wav2 = os.path.join(output_dir, 'vocals.mp3') accom_wav2 = os.path.join(output_dir, 'accompaniment.mp3') if os.path.exists(vocal_wav2): os.remove(vocal_wav2) if os.path.exists(accom_wav2): os.remove(accom_wav2) os.rename(vocal_wav1, vocal_wav2) os.rename(accom_wav1, accom_wav2) os.rmdir(os.path.join(output_dir, base_name)) if not (os.path.exists(vocal_wav2) and os.path.exists(accom_wav2)): raise FileNotFoundError(f"分離された音声ファイルが見つかりません。{vocal_wav2} または {accom_wav2}") return vocal_wav2, accom_wav2 # -------------------------------- # ピッチ解析(librosaでF0抽出しMIDIに変換) def analyze_pitch(vocal_wav): y, sr = librosa.load(vocal_wav, sr=None) f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')) times = librosa.times_like(f0, sr=sr) # F0(Hz) → MIDI番号 midi_pitches = np.array([librosa.hz_to_midi(f) if f is not None else np.nan for f in f0]) return times, midi_pitches # 音名→MIDI番号変換 NOTE_BASE = { 'C': 0, 'C#': 1, 'Db': 1, 'D': 2, 'D#': 3, 'Eb': 3, 'E': 4, 'F': 5, 'F#': 6, 'Gb': 6, 'G': 7, 'G#': 8, 'Ab': 8, 'A': 9, 'A#': 10, 'Bb': 10, 'B': 11 } def note_to_midi(note_str: str) -> int: """ 音名(例:"C4", "G#3", "Bb2")をMIDI番号に変換する。 """ note_str = note_str.strip() if len(note_str) < 2: raise ValueError(f"Invalid note format: {note_str}") if note_str[1] in ['#', 'b']: note_name = note_str[:2] octave_str = note_str[2:] else: note_name = note_str[0] octave_str = note_str[1:] if note_name not in NOTE_BASE: raise ValueError(f"Invalid note name: {note_name}") try: octave = int(octave_str) except: raise ValueError(f"Invalid octave number: {octave_str}") midi_num = (octave + 1) * 12 + NOTE_BASE[note_name] if midi_num < 0 or midi_num > 127: raise ValueError(f"MIDI number out of range: {midi_num} from note {note_str}") return midi_num # 累積時間計算 def calc_cumulative_time(times, midi_pitches, shift=0): pitches = np.array(midi_pitches) + shift times = np.array(times) valid_idx = ~np.isnan(pitches) pitches = pitches[valid_idx].astype(int) times = times[valid_idx] dt = np.diff(times) dt = np.append(dt, np.mean(dt) if len(dt) > 0 else 0) pitch_time_dict = {} for p, delta_t in zip(pitches, dt): pitch_time_dict[p] = pitch_time_dict.get(p, 0) + delta_t return pitch_time_dict def calculate_shift(midi_pitches, times, target_min_note, target_max_note): th=0.1 #最大値の10% #print(midi_pitches,times,target_min_note,target_max_note) pitches = np.array(midi_pitches) # 中心音高の探索(最大値含む) peak_pitch = np.argmax(pitches) peak_value = np.max(pitches) print("peak_pitch",peak_pitch) print("peak_value",peak_value) # min_shift: 左方向にゼロでない最初の位置(peak_pitch含む) min_pitch = peak_pitch cnt=0 while min_pitch > 0 and cnt<=1: if pitches[min_pitch]/peak_value < th: cnt+=1 min_pitch -= 1 min_pitch += 1 # ゼロ地点を超えたところに戻す print("min_pitch",min_pitch) # max_pitch: 右方向にゼロでない最初の位置(peak_pitch含む) max_pitch = peak_pitch cnt=0 while max_pitch < 127 and cnt<=1: if pitches[max_pitch]/peak_value < th : cnt+=1 max_pitch += 1 max_pitch -= 1 print("max_pitch",max_pitch) # 範囲の中心と目標の中心を比較して、必要なシフトを算出 actual_center = (min_pitch + max_pitch) // 2 target_center = (target_min_note + target_max_note) // 2 print("actual_center",actual_center) print("target_center",target_center) shift_needed = target_center - actual_center print("shift_needed",shift_needed) return shift_needed # アルファベット名の生成(#を除く) note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'] def midi_to_name(n): name = note_names[n % 12] octave = n // 12 - 1 return name + str(octave) def plot_pitch_histogram(times, midi_pitches, title, filename, target_min_note, target_max_note): set_japanese_font() pitches = np.array(midi_pitches) times = np.array(times) valid_idx = ~np.isnan(pitches) pitches = pitches[valid_idx] times = times[valid_idx] if len(pitches) == 0: print("有効な音程データがありません。") return pitches_int = np.floor(pitches).astype(int) dt = np.diff(times) if len(dt) == 0: print("timesデータが不十分です。") return dt = np.append(dt, np.mean(dt)) pitch_time_dict = {} for p, delta_t in zip(pitches_int, dt): pitch_time_dict[p] = pitch_time_dict.get(p, 0) + delta_t minx = 20 maxx = 100 all_pitches = np.arange(minx, maxx + 1) times_per_pitch = [pitch_time_dict.get(p, 0) for p in all_pitches] # ラベルを2行表示:上が番号、下が名前(白鍵のみ) labels = [] for n in all_pitches: name = midi_to_name(n) if '#' in name: labels.append(f"{n}\n") else: labels.append(f"{n}\n{name}") plt.figure(figsize=(12, 6)) plt.bar(all_pitches, times_per_pitch, color='skyblue', edgecolor='black') plt.xlabel("MIDIノート番号 / 音名") plt.ylabel("累積使用時間(秒)") plt.title(title) plt.grid(axis='y') plt.xlim(minx, maxx) plt.xticks(all_pitches, labels, rotation=0, fontsize=8) plt.axvspan(target_min_note, target_max_note, color='orange', alpha=0.2) plt.tight_layout() plt.savefig(filename) plt.close() print(f"ヒストグラム保存: {filename}") def create_pitch_usage_vector(midi_pitches): pitch_usage = np.zeros(128) for pitch in midi_pitches: if not np.isnan(pitch) and 0 <= int(pitch) < 128: pitch_usage[int(pitch)] += 1 return pitch_usage def find_first_media_file(path): # パスがフォルダか確認 if not os.path.isdir(path): return path # フォルダでなければ # フォルダ直下のファイルを取得 for filename in os.listdir(path): full_path = os.path.join(path, filename) if os.path.isfile(full_path) and (filename.lower().endswith('.mp3')): return path+"/"+filename # 最初に見つけたファイル名を返す if os.path.isfile(full_path) and (filename.lower().endswith('.mp4')): return path+"/"+filename # 最初に見つけたファイル名を返す return None # 見つからなければ None # -------------------------------- # メイン処理関数 def main(): if len(sys.argv) < 2: print("使い方: python script.py 入力ファイルまたはフォルダ [指定音域(例:G3:D6 または 48:74)]") #35:64 B1:F4 sys.exit(1) input_file = find_first_media_file(sys.argv[1]) target_range = sys.argv[2] if len(sys.argv) > 2 else "G3:D6" print(f"入力ファイル: {input_file}") print(f"指定音域: {target_range}") target_min_note=20 target_max_note=100 part1, part2 = target_range.split(":") if part1.isdigit() and part2.isdigit() : target_min_note=int(part1) target_max_note=int(part2) else: target_min_midi, target_max_midi = parse_range(target_range) target_min_note=note_to_midi(target_min_midi) target_max_note=note_to_midi(target_max_midi) # mp4/mp3判定してmp3抽出 mp3_file = extract_mp3(input_file) base_name=os.path.dirname(mp3_file) # ボーカル抽出 vocal_wav, accom_wav = separate_vocals(mp3_file) # 音程解析 times, midi_pitches = analyze_pitch(vocal_wav) # --- 元のボーカル音でのグラフ --- graph_file_original = os.path.join(base_name , "graph_original.jpg") plot_pitch_histogram(times, midi_pitches, title=f"{base_name} の音域 , 指定音域 {target_range}", filename=graph_file_original, target_min_note=target_min_note, target_max_note=target_max_note) pitch_usage = create_pitch_usage_vector(midi_pitches) #print(pitch_usage) # キーシフト計算 shiftorg = calculate_shift(pitch_usage, times, target_min_note, target_max_note) octave_shift, shift = divmod(shiftorg, 12) if shift>=7: octave_shift+=1 shift=shift-12 print("オクターブ(=12キー)シフト",octave_shift) print("キーシフト",shift) # 元mp3キーシフトして保存(librosaはwavのみなので、一旦wav変換→シフト→mp3) temp_original_wav = os.path.join(base_name , "temp.wav") AudioSegment.from_file(mp3_file).export(temp_original_wav, format="wav") shifted_original_wav = os.path.join(base_name ,f"shift{shift:+d}.wav") pitch_shift_audio(temp_original_wav, shifted_original_wav, shift) shifted_original_mp3 = os.path.join(base_name ,f"shift{shift:+d}.mp3") AudioSegment.from_wav(shifted_original_wav).export(shifted_original_mp3, format="mp3") os.remove(temp_original_wav) os.remove(shifted_original_wav) # ボーカルwavキーシフトしてmp3保存 shifted_vocal_wav = os.path.join(base_name ,f"vocals_shift{shift:+d}.wav") pitch_shift_audio(vocal_wav, shifted_vocal_wav, shift) shifted_vocal_mp3 = os.path.join(base_name ,f"vocals_shift{shift:+d}.mp3") AudioSegment.from_wav(shifted_vocal_wav).export(shifted_vocal_mp3, format="mp3") os.remove(shifted_vocal_wav) # 伴奏wavキーシフトしてmp3保存 shifted_accom_wav = os.path.join(base_name ,f"accompaniment_shift{shift:+d}.wav") pitch_shift_audio(accom_wav, shifted_accom_wav, shift) shifted_accom_mp3 = os.path.join(base_name ,f"accompaniment_shift{shift:+d}.mp3") AudioSegment.from_wav(shifted_accom_wav).export(shifted_accom_mp3, format="mp3") os.remove(shifted_accom_wav) # --- キーシフト後のグラフ --- graph_file_shifted = os.path.join(base_name , f"graph_shift{shift:+d}.jpg") shifted_midi_pitches = midi_pitches + shiftorg # シフト量を足すだけ(numpyなら要素ごとに加算) plot_pitch_histogram(times, shifted_midi_pitches, title=f"{base_name} の音域 , 指定音域 {target_range} , オクターブ(=12キー)シフト {octave_shift} , キーシフト {shift:+d}", filename=graph_file_shifted, target_min_note=target_min_note, target_max_note=target_max_note) print("処理完了") if __name__ == "__main__": main()