YouTubeScript/app.py at main · Muavia1/YouTubeScript · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import shutil
import subprocess

import gradio as gr
import whisper
from transformers import pipeline

# Build the speech-to-text model and the summarisation pipeline once at import
# time; `inference` reads both as module-level globals on every request.
model = whisper.load_model(name="base")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
def write_vtt(result, filename="output.vtt"):
    """Write the segments of a transcription result to *filename* as WebVTT.

    Args:
        result: Mapping with a ``'segments'`` list. Each segment must provide
            ``'start'`` and ``'end'`` (seconds) and ``'text'``.
        filename: Destination path; an existing file is overwritten.
    """
    # WebVTT files are UTF-8; being explicit avoids platform-default encodings
    # failing on non-ASCII transcript text.
    with open(filename, 'w', encoding="utf-8") as vtt_file:
        # The signature line followed by a blank line is mandatory; without it
        # players reject the file as WebVTT.
        vtt_file.write("WEBVTT\n\n")
        for idx, segment in enumerate(result['segments'], start=1):
            start = format_vtt_timestamp(segment['start'])
            end = format_vtt_timestamp(segment['end'])
            # Drop surrounding whitespace so cue text does not start with a
            # stray space or end with a blank line that would close the cue early.
            text = segment['text'].strip()
            vtt_file.write(f"{idx}\n{start} --> {end}\n{text}\n\n")


def format_vtt_timestamp(seconds):
    """Format a time offset in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The zero-padded timestamp string, rounded to the nearest millisecond.
    """
    # Convert to integer milliseconds exactly once. Truncating the fractional
    # part separately loses a millisecond to float error (1.001 % 1 is slightly
    # below 0.001), and rounding here lets 59.9996 carry cleanly into 00:01:00.000.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"

def inference(choice, link, mode, selected_language, audio_file):
    """Transcribe or translate audio, summarise it and write a subtitle file.

    Args:
        choice: ``"YouTube Video"`` to fetch audio with yt-dlp, or
            ``"Audio File"`` to use the uploaded file.
        link: Video URL; required when *choice* is ``"YouTube Video"``.
        mode: ``"Original"`` (or empty) transcribes the audio; any other value
            runs Whisper with ``task="translate"``.
        selected_language: Language handed to Whisper as ``language`` in
            translate mode. When empty, ``None`` is passed instead.
        audio_file: Uploaded file, either an object exposing ``.name`` or a
            plain path string; required when *choice* is ``"Audio File"``.

    Returns:
        A ``(text, vtt_path)`` tuple. On success *text* is the summary (or the
        full transcript when it is 20 words or fewer) and *vtt_path* is the
        subtitle file. On invalid input or a failed download *text* is a
        user-facing message and *vtt_path* is ``None``.
    """
    output_file = "audio.mp3"
    vtt_file = "sub.vtt"

    # Validate everything first so bad input never reaches the models.
    link = (link or "").strip()
    if choice == "YouTube Video":
        if not link:
            return "Please provide a YouTube link.", None
    elif choice == "Audio File":
        if not audio_file:
            return "Please upload an audio file.", None
    else:
        return "Please choose an input type.", None

    # Remove the previous audio so a failed download cannot be mistaken for a
    # fresh one.
    if os.path.exists(output_file):
        os.remove(output_file)

    if choice == "YouTube Video":
        # Argument list + no shell: the link is user input and must never be
        # interpreted by a shell. "--" stops a link starting with "-" from
        # being parsed as a yt-dlp option.
        try:
            subprocess.run(
                ["yt-dlp", "-x", "--audio-format", "mp3", "-o", output_file, "--", link],
                check=False,
            )
        except FileNotFoundError:
            return "yt-dlp is not installed or not on PATH.", None
        if not os.path.exists(output_file):
            return "Could not download audio from the provided link.", None
    else:
        # Copy instead of rename: moving the upload would pull the file out
        # from under the UI's audio player and fails across filesystems.
        audio_path = getattr(audio_file, "name", audio_file)
        shutil.copyfile(audio_path, output_file)

    # Perform transcription
    if not mode or mode == "Original":
        result = model.transcribe(output_file, word_timestamps=True)
    else:
        # The dropdown may be left empty; pass None rather than crash on .lower().
        language = selected_language.lower() if selected_language else None
        result = model.transcribe(
            output_file, task="translate", language=language, word_timestamps=True
        )

    # Summarize the transcript
    transcript_text = result['text']
    if len(transcript_text.split()) > 20:  # Ensure there's enough content for summarization
        # truncation=True keeps transcripts longer than the model's input
        # limit from raising inside the pipeline.
        summary = summarizer(
            transcript_text,
            max_length=100,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
    else:
        summary = transcript_text

    # write_vtt opens the file in 'w' mode, so any previous file is replaced.
    write_vtt(result, vtt_file)

    return summary, vtt_file


# Languages offered in the dropdown, English excluded.
# `whisper.tokenizer.LANGUAGES` maps language codes to lowercase language
# names, so the names (the values) are what must be compared against
# "english" and shown to the user; iterating the keys would list codes and
# never filter anything out.
available_languages = [
    name.capitalize()
    for name in whisper.tokenizer.LANGUAGES.values()
    if name != "english"
]

title = "YouTubeScript"
description = "Get Any YouTube Video or Audio File Transcript!!!"
block = gr.Blocks()

# Build the UI. Components are created in the order they should appear, and
# each event handler is registered right after the components it connects.
with block:
    gr.Markdown(
        """# YouTubeScript
Get your YouTube transcription or upload audio files easily!
        """
    )

    # Choice selection: YouTube or Audio File
    with gr.Row():
        choice = gr.Radio(["YouTube Video", "Audio File"], label="Choose Input Type")

    # YouTube input: hidden until "YouTube Video" is selected.
    youtube_link = gr.Textbox(label="YouTube Video Link", visible=False)
    choice.change(lambda x: gr.update(visible=(x == "YouTube Video")), inputs=choice, outputs=youtube_link)

    # Audio file input: the upload widget and a player for previewing it, both
    # hidden initially.
    audio_file = gr.File(label="Upload Audio File", visible=False)
    audio_player = gr.Audio(label="Play Uploaded Audio", type="filepath", visible=False)

    def show_audio_player(file):
        """Show the player loaded with the uploaded file, or hide it when there is none."""
        if file:
            return gr.update(visible=True, value=file.name)
        return gr.update(visible=False)

    # The upload widget follows the input-type choice; the player follows the
    # upload widget's value.
    # NOTE(review): no handler here hides `audio_player` when the choice changes;
    # it is only updated when `audio_file` changes or the fields are cleared.
    choice.change(lambda x: gr.update(visible=(x == "Audio File")), inputs=choice, outputs=[audio_file])
    audio_file.change(show_audio_player, inputs=audio_file, outputs=audio_player)

    # Mode selection and language dropdown
    with gr.Row():
        mode = gr.Radio(["Original", "Translate"], label="Mode", value=None)
    # NOTE(review): `inference` forwards this value to Whisper as `language`
    # together with task="translate" — confirm that the label matches what
    # Whisper does with that option.
    language_dropdown = gr.Dropdown(available_languages, label="Select Language for Translation", visible=False)

    def toggle_language_dropdown(mode):
        """Show the language dropdown only while the mode is "Translate"."""
        return gr.update(visible=(mode == "Translate"))

    mode.change(toggle_language_dropdown, inputs=[mode], outputs=[language_dropdown])

    btn = gr.Button("Get YouTubeScript 🪄")

    # Receives the first element of the tuple returned by `inference`.
    text = gr.Textbox(
        label="Transcript Summary",
        placeholder="Summary Output",
        lines=5
    )

    # Transcript file for download (second element returned by `inference`)
    transcription = gr.File()

    # Clear button functionality
    def clear_fields():
        """Return one update per output of `clear_btn.click`, in the same order.

        Order: choice, youtube_link, audio_file, mode, language_dropdown, text,
        transcription, audio_player. Values are reset, and the language
        dropdown and audio player are hidden as well.
        """
        return gr.update(value=None),gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(visible=False, value=None), gr.update(value=""), gr.update(value=None), gr.update(visible=False)

    clear_btn = gr.Button("Clear Fields 🧹")

    # Wire the two buttons: run the pipeline, or reset every field.
    btn.click(inference, inputs=[choice, youtube_link, mode, language_dropdown, audio_file], outputs=[text, transcription])
    clear_btn.click(clear_fields, outputs=[choice,youtube_link, audio_file, mode, language_dropdown, text, transcription, audio_player])

# Start the app with both the share and debug options switched on.
block.launch(
    share=True,
    debug=True,
)