import os import cloudscraper from bs4 import BeautifulSoup from datetime import datetime, timedelta from openpyxl import Workbook, load_workbook from openpyxl.styles import PatternFill, Font import time import pytz # === Setup === EXCEL_FILE = 'Historic News Data.xlsx' START_DATE = datetime(2024, 7, 1) END_DATE = datetime(2025, 4, 22) HEADERS = [ "Date", "Time", "DateTime", "Currency", "Event", "Actual", "Forecast", "Difference", "Result" ] # === Colors === FILL_GREEN = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") FILL_RED = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") HEADER_FONT = Font(bold=True, size=12) NORMAL_FONT = Font(size=11) # === Initialize scraper === scraper = cloudscraper.create_scraper() # === Excel Setup === def init_workbook(): if not os.path.exists(EXCEL_FILE): wb = Workbook() ws = wb.active ws.title = "All" ws.append(HEADERS) for col in range(1, len(HEADERS) + 1): ws.cell(row=1, column=col).font = HEADER_FONT wb.save(EXCEL_FILE) def clean_value(val): if isinstance(val, str): val = val.replace(',', '').replace('K', '000').replace('M', '000000') val = val.replace('%', '') return float(val) if val else 0.0 def calculate_percentage_diff(actual, forecast): try: actual_num = clean_value(actual) forecast_num = clean_value(forecast) if forecast_num == 0: return 0.0 return ((actual_num - forecast_num) / forecast_num) * 100 except (ValueError, TypeError): return None def get_result_from_classes(row): actual_cell = row.find("td", class_="calendar__actual") if not actual_cell: return "Neutral" span = actual_cell.find("span") if span and span.has_attr("class"): classes = span["class"] if "better" in classes: return "Better" elif "worse" in classes: return "Worse" row_classes = row.get("class", []) if "better" in row_classes: return "Better" elif "worse" in row_classes: return "Worse" return "Neutral" def get_fill_for_result(result): if result == "Better": return FILL_GREEN elif result == "Worse": return FILL_RED return None def style_headers(sheet): for col in range(1, len(HEADERS) + 1): sheet.cell(row=1, column=col).font = HEADER_FONT def parse_and_save(html, target_date): soup = BeautifulSoup(html, 'html.parser') rows = soup.find_all("tr", class_="calendar__row") parsed_rows = [] current_time_str = None local_zone = pytz.timezone("Asia/Karachi") # Your system/browser shows times in Pakistan time target_zone = pytz.FixedOffset(180) # IC Markets server time (UTC+3) for row in rows: time_cell = row.find("td", class_="calendar__time") time_text = time_cell.text.strip() if time_cell else "" if time_text and time_text.lower() != "all day": current_time_str = time_text if not current_time_str: continue impact_cell = row.find("td", class_="calendar__impact") if not impact_cell: continue impact_icon = impact_cell.find("span") if not impact_icon or 'icon--ff-impact-red' not in impact_icon.get("class", []): continue try: naive_dt = datetime.strptime(f"{target_date.strftime('%Y-%m-%d')} {current_time_str}", "%Y-%m-%d %I:%M%p") localized_dt = local_zone.localize(naive_dt) server_dt = localized_dt.astimezone(target_zone) except Exception: continue currency_tag = row.find("td", class_="calendar__currency") event_tag = row.find("td", class_="calendar__event") actual_tag = row.find("td", class_="calendar__actual") forecast_tag = row.find("td", class_="calendar__forecast") if not currency_tag or not event_tag: continue currency = currency_tag.text.strip() event = event_tag.text.strip() actual = actual_tag.text.strip() if actual_tag else "" forecast = forecast_tag.text.strip() if forecast_tag else "" if actual in ["", "-", "—"]: continue percentage_diff = calculate_percentage_diff(actual, forecast) result = get_result_from_classes(row) row_data = [ server_dt.strftime("%Y-%m-%d"), server_dt.strftime("%H:%M"), server_dt.strftime("%Y-%m-%d %H:%M"), currency, event, actual, forecast, f"{percentage_diff:.2f}%" if percentage_diff is not None else "", result ] parsed_rows.append((row_data, result)) if parsed_rows: wb = load_workbook(EXCEL_FILE) def write_row(sheet, row_data, result): ws = wb[sheet] row = ws.max_row + 1 for col, value in enumerate(row_data, start=1): cell = ws.cell(row=row, column=col, value=value) cell.font = NORMAL_FONT if col == 9: fill = get_fill_for_result(result) if fill: cell.fill = fill if "All" not in wb.sheetnames: ws = wb.create_sheet("All") ws.append(HEADERS) style_headers(ws) for row_data, result in parsed_rows: write_row("All", row_data, result) currency = row_data[3] if currency not in wb.sheetnames: currency_ws = wb.create_sheet(currency) currency_ws.append(HEADERS) style_headers(currency_ws) write_row(currency, row_data, result) wb.save(EXCEL_FILE) print(f"āœ… Saved {len(parsed_rows)} items for {target_date.strftime('%b %d')}") else: print(f"ā„¹ļø No high-impact items for {target_date.strftime('%b %d')}") # === Main === init_workbook() current_date = END_DATE while current_date >= START_DATE: date_str = current_date.strftime("%b%d.%Y") url = f"https://www.forexfactory.com/calendar?day={date_str}" print(f"\nšŸ“… Scraping: {current_date.strftime('%Y-%m-%d')} -> {url}") try: response = scraper.get(url, timeout=10) if response.status_code == 200: parse_and_save(response.text, current_date) else: print(f"āŒ HTTP {response.status_code} for {url}") except Exception as e: print(f"āŒ Error fetching {url}: {str(e)}") current_date -= timedelta(days=1) time.sleep(1) print("\nāœ… All done!")