#!/usr/bin/env python3 """ PTV Multi-Channel EPG Scraper - Enhanced Version - Real-time progress output - Comprehensive time parsing - Robust error handling - All original functionality preserved """ import sys import signal import time import threading import os import re from datetime import datetime, timedelta from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.chrome.service import Service from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from bs4 import BeautifulSoup import xml.etree.ElementTree as ET from xml.dom.minidom import parseString import pytz from pathlib import Path from collections import defaultdict import undetected_chromedriver as uc import random # ======================== CONFIGURATION ======================== OUTPUT_FILENAME = "ptv_epg.xml" DEFAULT_PROGRAM_DURATION = { 'sports': timedelta(hours=4), 'default': timedelta(hours=1) } ABBREVIATIONS = { # Sports abbreviations 'ICC', 'EPL', 'IPL', 'PCB', 'FIFA', 'UEFA', 'NBA', 'NFL', 'NHL', 'MLB', 'PGA', 'US', 'UK', 'USA', 'T20', 'ODI', 'PSL', 'FIA', 'F1', 'ATP', 'WTA', 'WTC', 'UFC', 'CFL', 'PZ', 'KK', 'LQ','QG','MS','IU','IL', 'GG', 'DV','TRI','WI','NZ','PK','PAK','IN','SL','BD','BG','EN','SA','US','USA','UK', 'AU','AUS','SF','H/L','T20I' # Home abbreviations 'PTV', 'BBC', 'CNN', 'PBC', 'EID', 'EP', 'NO', 'PROG', 'QURAN', 'NAAT', 'HADITH', 'PAKISTAN', 'AM', 'PM', 'EID','AZAN','EP','CA','PAK', # World abbreviations 'R', 'RPT', 'HEADLINES', 'AL QURAN', 'COVID', 'COVID-19', 'UAE', 'UN', 'WHO', 'WTO', 'IMF', 'EU', 'NATO', 'UNSC', 'LIVE', # News abbreviations 'PTI', 'AFP', 'AP', 'UNESCO', # Global abbreviations 'S.A.W', 'DC', 'TV', 'F', # National abbreviations 'NN', 'AIOU', 'SINDHI', 'SINDH', 'AL-QURAN', # Bolan abbreviations 'MN', 'BRAHVI', 'BALOCHI', 'PASHTO', 'QURAAN', 'IQRA', 'AZAN', 'MAGHRIB', 'FAJAR', 'SEHAR', 'DAROOD', 'IBRAHIM' } CHANNELS = { 'sports': { 'id': 'PTV SPORTS.pk', 'name': 'PTV Sports', 'url': 'https://sports.ptv.com.pk/tvguidemaster', 'duration': 'sports' }, 'home': { 'id': 'PTV HOME.pk', 'name': 'PTV Home', 'url': 'https://ptv.com.pk/ptvhome/tvguide', 'duration': 'default' }, 'world': { 'id': 'PTV WORLD.pk', 'name': 'PTV World', 'url': 'https://ptv.com.pk/ptvworld/tvguide', 'duration': 'default' }, 'news': { 'id': 'PTV NEWS.pk', 'name': 'PTV News', 'url': 'https://ptv.com.pk/ptvnews/tvguide', 'duration': 'default' }, 'global': { 'id': 'PTV GLOBAL.pk', 'name': 'PTV Global', 'url': 'https://ptv.com.pk/ptvglobal/tvguide', 'duration': 'default' }, 'national': { 'id': 'PTV NATIONAL.pk', 'name': 'PTV National', 'url': 'https://ptv.com.pk/ptvnational/tvguide', 'duration': 'default' }, 'bolan': { 'id': 'PTV BOLAN.pk', 'name': 'PTV Bolan', 'url': 'https://ptv.com.pk/ptvbolan/tvguide', 'duration': 'default' } } # ======================== UTILITY FUNCTIONS ======================== def handler(signum, frame): print("\nâšī¸ Script terminated by user") sys.exit(0) signal.signal(signal.SIGINT, handler) def countdown_timer(): """Display countdown timer before automatic exit""" for i in range(5, 0, -1): print(f"\rWindow will close in {i} seconds... (Press Enter to exit now)", end="") time.sleep(1) print("\nClosing program automatically...") os._exit(0) def print_channel_header(channel_name): """Print consistent channel header""" print(f"\n{'='*50}") print(f"📡 Scraping {CHANNELS[channel_name]['name']}") print(f"🌐 URL: {CHANNELS[channel_name]['url']}") def print_day_results(day_name, programs, week_dates): """Print day's results with date and end times""" date_str = week_dates[day_name].strftime('%a %d %b') print(f"\n📅 {day_name} ({date_str}):") for i, program in enumerate(programs, 1): start_str = program['start'].strftime('%H:%M') end_str = program['end'].strftime('%H:%M') if program['end'] else '--:--' print(f" {i}. {start_str}-{end_str} | {program['title']}") print(f"✔ Found {len(programs)} programs") def format_title(title): """Improved title formatting with fixes for parentheses and specific time parsing""" # Protect all abbreviations (they should remain ALL CAPS) protected = {} for abbr in ABBREVIATIONS: if abbr in title.upper(): # Find the exact case match in the original title matches = re.finditer(re.escape(abbr), title, re.IGNORECASE) for match in matches: protected[match.group()] = match.group() # Convert to Title Case while handling parentheses words = re.split(r'(\()', title) # Split on parentheses to handle them separately formatted_parts = [] for part in words: if part == '(': formatted_parts.append(part) elif part: # Process each word separately sub_words = part.split() formatted_sub_words = [] for word in sub_words: if word in protected: formatted_sub_words.append(word) else: # Convert to Title Case while preserving hyphenated words if '-' in word: parts = word.split('-') formatted_word = '-'.join([p.capitalize() for p in parts]) else: formatted_word = word.capitalize() formatted_sub_words.append(formatted_word) formatted_parts.append(' '.join(formatted_sub_words)) # Reconstruct the title formatted_title = ''.join(formatted_parts) # Handle special cases that should remain ALL CAPS special_allcaps = ['LIVE', 'REPEAT', 'TELECAST', 'SPECIAL', 'PROG'] words = formatted_title.split() for i, word in enumerate(words): if word.upper() in special_allcaps: words[i] = word.upper() return ' '.join(words) def parse_time(time_str, channel=None): """Time parser with enhanced PTV Global multi-timezone handling""" if not time_str or not isinstance(time_str, str): print(f"âš ī¸ Empty/invalid time string for channel {channel}") return None original_time = time_str.strip() time_str = original_time.upper() # ===== Special Handling for PTV Global ===== if channel == 'global': # Handle all variants: PST, ST, PSA, and cases with multiple UK times time_match = re.search( r'(?P