Youtube channel scraper

In [ ]:

  Copied!     
 
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import sys from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from bs4 import BeautifulSoup import time

In [ ]:

  Copied!     
 
def scrape_youtube_channel(url):
    # Determine the video type based on the URL
    if url.split('/')[-1] == 'videos':
        video_type = "videos"
    else:
        video_type = "shorts"

    # Create a new instance of the Edge driver
    driver = webdriver.Edge()

    # Open the YouTube channel page
    driver.get(url)
    time.sleep(5)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")

    # Scroll to the bottom of the page to load all videos
    while True:
        # Scroll down to the bottom
        print("Scrolling down...")
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)

        # Check if the page height has increased
        driver.implicitly_wait(3)
        time.sleep(3)  # Adjust the wait time as needed

        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            break  # End of page reached
        last_height = new_height

    # Get the page source after scrolling
    page_source = driver.page_source

    # Close the webdriver
    driver.quit()

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find all links with the specified attributes
    if video_type == "shorts":
        video_links = soup.find_all('a', {'id': 'thumbnail', 'class': 'yt-simple-endpoint inline-block style-scope ytd-thumbnail'})
    elif video_type == "videos":
        video_links = soup.find_all('a', {'id': 'thumbnail', 'class': 'yt-simple-endpoint style-scope ytd-playlist-thumbnail'})

    # Extract and print the href attributes
    channel = url.split('/')[-2]

    with open(f'{channel}-{video_type}.txt', 'w') as f:
        for link in video_links:
            href = link.get('href')
            if href:
                f.write(f"https://www.youtube.com{href}\n")
def scrape_youtube_channel(url): # Determine the video type based on the URL if url.split('/')[-1] == 'videos': video_type = "videos" else: video_type = "shorts" # Create a new instance of the Edge driver driver = webdriver.Edge() # Open the YouTube channel page driver.get(url) time.sleep(5) last_height = driver.execute_script("return document.documentElement.scrollHeight") # Scroll to the bottom of the page to load all videos while True: # Scroll down to the bottom print("Scrolling down...") driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END) # Check if the page height has increased driver.implicitly_wait(3) time.sleep(3) # Adjust the wait time as needed new_height = driver.execute_script("return document.documentElement.scrollHeight") if new_height == last_height: break # End of page reached last_height = new_height # Get the page source after scrolling page_source = driver.page_source # Close the webdriver driver.quit() # Parse the HTML with BeautifulSoup soup = BeautifulSoup(page_source, 'html.parser') # Find all links with the specified attributes if video_type == "shorts": video_links = soup.find_all('a', {'id': 'thumbnail', 'class': 'yt-simple-endpoint inline-block style-scope ytd-thumbnail'}) elif video_type == "videos": video_links = soup.find_all('a', {'id': 'thumbnail', 'class': 'yt-simple-endpoint style-scope ytd-playlist-thumbnail'}) # Extract and print the href attributes channel = url.split('/')[-2] with open(f'{channel}-{video_type}.txt', 'w') as f: for link in video_links: href = link.get('href') if href: f.write(f"https://www.youtube.com{href}\n")

In [ ]:

  Copied!     
 
if __name__ == "__main__":
    # Check if a command-line argument is provided
    if len(sys.argv) < 2:
        print("Usage: python youtube_channel_scraper.py <YouTube channel URL> Example: https://www.youtube.com/@Fireship/videos or https://www.youtube.com/@Fireship/shorts")
        sys.exit(1)

    # URL of the YouTube channel
    url = sys.argv[1]

    # Call the main function with the provided URL
    scrape_youtube_channel(url)
if __name__ == "__main__": # Check if a command-line argument is provided if len(sys.argv) < 2: print("Usage: python youtube_channel_scraper.py  Example: https://www.youtube.com/@Fireship/videos or https://www.youtube.com/@Fireship/shorts") sys.exit(1) # URL of the YouTube channel url = sys.argv[1] # Call the main function with the provided URL scrape_youtube_channel(url)