YOUTUBE_API_KEY=cheia_ta_aici

from pathlib import Path
import os
import json
import requests
from datetime import datetime
from dotenv import load_dotenv

# Locate the project root: start at the current working directory and climb
# until a folder containing a ".env" file is found, or the top of the
# filesystem is reached (a path whose parent is itself).
ROOT = Path.cwd()
while True:
    if (ROOT / ".env").exists() or ROOT.parent == ROOT:
        break
    ROOT = ROOT.parent

# Read the .env file from that folder, then pick up the API key from the
# process environment.
load_dotenv(ROOT.joinpath(".env"))
API_KEY = os.environ.get("YOUTUBE_API_KEY")
BASE_URL = "https://www.googleapis.com/youtube/v3"

# Report only whether a key was found, never the key itself.
print("Root proiect:", ROOT)
print("Cheie găsită:", API_KEY is not None)

Root proiect: c:\PROJECTS\echochamber-app
Cheie găsită: True

# Settings for this collection run.
student_id = "student_01"        # used as a prefix in output file names
handle = "digi24hd56"            # YouTube channel handle to look up
max_videos = 2                   # number of recent videos to fetch
max_comments_per_video = 100     # page size requested for each video's comments

# NOTE(review): this raw-output path is only printed; no cell visible in this
# export writes to it — confirm whether a "save raw data" step is missing.
output_file = ROOT.joinpath("data", "raw", f"{student_id}_youtube_raw.jsonl")
print(output_file)

c:\PROJECTS\echochamber-app\data\raw\student_01_youtube_raw.jsonl

# Look up the channel resource for `handle` through the `channels` endpoint.
channel_response = requests.get(
    f"{BASE_URL}/channels",
    params={
        "part": "id",
        "forHandle": handle,
        "key": API_KEY,
    },
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server and freeze the notebook cell.
    timeout=30,
)
channel_data = channel_response.json()
channel_data

{'kind': 'youtube#channelListResponse',
 'etag': 'bPGHk7xg5axBT4a3koCcnNny28s',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'BhuTK97GknHy20Igr-aKGXPuJdU',
   'id': 'UCbvKamSrJkwT6ed2BMMZXwg'}]}

# If the response has no non-empty "items" list (for example an unknown handle
# or an error payload), stop with a readable message instead of a bare
# KeyError/IndexError from the indexing below.
if not channel_data.get("items"):
    raise LookupError(
        f"Nu am găsit niciun canal pentru handle-ul {handle!r}. "
        f"Răspuns API: {channel_data}"
    )
channel_id = channel_data["items"][0]["id"]
channel_id

'UCbvKamSrJkwT6ed2BMMZXwg'

# Ask the `search` endpoint for the channel's videos, ordered by date, limited
# to `max_videos` results.
videos_response = requests.get(
    f"{BASE_URL}/search",
    params={
        "part": "snippet",
        "channelId": channel_id,
        "type": "video",
        "order": "date",
        "maxResults": max_videos,
        "key": API_KEY,
    },
    # Without a timeout, requests can wait indefinitely on an unresponsive
    # server and freeze the notebook cell.
    timeout=30,
)
videos_data = videos_response.json()
# Show the first result to inspect the response structure.
videos_data["items"][0]

{'kind': 'youtube#searchResult',
 'etag': 'pW-oMrmuV_wk35JIH55B1oWoQJs',
 'id': {'kind': 'youtube#video', 'videoId': 'Fyk8Ob7CRjw'},
 'snippet': {'publishedAt': '2026-05-04T09:51:48Z',
  'channelId': 'UCbvKamSrJkwT6ed2BMMZXwg',
  'title': '🟣 Știrile Digi24 de la ora 12 – 4 mai 2026',
  'description': 'Știrile Digi24 de la ora 12 – 4 mai 2026 ➥ Pentru mai multe știri vizitează site-ul Digi24 - https://www.digi24.ro/ ➥ Abonează-te la ...',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/Fyk8Ob7CRjw/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/Fyk8Ob7CRjw/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/Fyk8Ob7CRjw/hqdefault.jpg',
    'width': 480,
    'height': 360}},
  'channelTitle': 'Digi24HD',
  'liveBroadcastContent': 'none',
  'publishTime': '2026-05-04T09:51:48Z'}}

# Reduce each search result to the three fields used later: the video id, its
# title, and the first 10 characters of `publishedAt` (the YYYY-MM-DD part).
videos = [
    {
        "video_id": entry["id"]["videoId"],
        "video_title": entry["snippet"]["title"],
        "video_date": entry["snippet"]["publishedAt"][:10],
    }
    for entry in videos_data["items"]
]
videos

[{'video_id': 'Fyk8Ob7CRjw',
  'video_title': '🟣 Știrile Digi24 de la ora 12 – 4 mai 2026',
  'video_date': '2026-05-04'},
 {'video_id': 'VV9sV-eBVeA',
  'video_title': '#PetStory: Povestea cățelușei Eli #digi24',
  'video_date': '2026-05-04'}]

# Cell-local import: the file header only imports `datetime` from the module.
from datetime import timezone

comments = []

# One collection date for the whole run. datetime.utcnow() is deprecated, so a
# timezone-aware "now" in UTC is used instead; computing it once also keeps the
# value identical across all records of the run.
collected_at = datetime.now(timezone.utc).strftime("%Y-%m-%d")

for video in videos:
    print("Colectez:", video["video_title"][:80])
    comments_response = requests.get(
        f"{BASE_URL}/commentThreads",
        params={
            "part": "snippet",
            "videoId": video["video_id"],
            "maxResults": max_comments_per_video,
            "textFormat": "plainText",
            "order": "relevance",
            "key": API_KEY,
        },
        # Without a timeout, requests can wait indefinitely on an unresponsive
        # server and freeze the notebook cell.
        timeout=30,
    )

    # A failed request used to be indistinguishable from "this video has no
    # comments"; report it and move on to the next video.
    if not comments_response.ok:
        print("  Cerere eșuată, HTTP", comments_response.status_code)
        continue

    comments_data = comments_response.json()
    for comment_item in comments_data.get("items", []):
        # Only the top-level comment of each thread is read.
        snippet = comment_item["snippet"]["topLevelComment"]["snippet"]
        record = {
            "id": f"yt_{video['video_id']}_{comment_item['id']}",
            "source_platform": "youtube",
            "source_channel": handle,
            "text_raw": snippet["textDisplay"],
            "video_id": video["video_id"],
            "video_title": video["video_title"],
            "video_date": video["video_date"],
            # Keep only the YYYY-MM-DD prefix of the timestamp.
            "comment_date": snippet["publishedAt"][:10],
            "likes": snippet["likeCount"],
            "collected_at": collected_at,
        }
        comments.append(record)
len(comments)

Colectez: 🟣 Știrile Digi24 de la ora 12 – 4 mai 2026

C:\Users\alexe\AppData\Local\Temp\ipykernel_29276\3206550858.py:28: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
  "collected_at": datetime.utcnow().strftime("%Y-%m-%d")

Colectez: #PetStory: Povestea cățelușei Eli #digi24

2

# Preview up to the first three collected comment records.
comments[:3]

[{'id': 'yt_Fyk8Ob7CRjw_Ugwc6D97EMahyFzA7Xx4AaABAg',
  'source_platform': 'youtube',
  'source_channel': 'digi24hd56',
  'text_raw': 'TV  DIGI  24   TV  A  SISTEMULUI   !!!!',
  'video_id': 'Fyk8Ob7CRjw',
  'video_title': '🟣 Știrile Digi24 de la ora 12 – 4 mai 2026',
  'video_date': '2026-05-04',
  'comment_date': '2026-05-04',
  'likes': 0,
  'collected_at': '2026-05-04'},
 {'id': 'yt_Fyk8Ob7CRjw_Ugz3uhxyQ_w2qF4MJdF4AaABAg',
  'source_platform': 'youtube',
  'source_channel': 'digi24hd56',
  'text_raw': 'Auristule ați pierdut din start facind o coaliție cu PSD  va compromis o sa va invită psd',
  'video_id': 'Fyk8Ob7CRjw',
  'video_title': '🟣 Știrile Digi24 de la ora 12 – 4 mai 2026',
  'video_date': '2026-05-04',
  'comment_date': '2026-05-04',
  'likes': 0,
  'collected_at': '2026-05-04'}]

# List the field names present on the first collected record.
comments[0].keys()

dict_keys(['id', 'source_platform', 'source_channel', 'text_raw', 'video_id', 'video_title', 'video_date', 'comment_date', 'likes', 'collected_at'])

import re

def clean_text(text):
    """Return *text* with links removed and whitespace normalized.

    Every token starting with ``http`` is dropped, each run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is trimmed.
    """
    without_links = re.sub(r"http\S+", "", text)
    single_spaced = re.sub(r"\s+", " ", without_links)
    return single_spaced.strip()

# Add a cleaned "text" field to every record, keeping "text_raw" untouched.
for comment in comments:
    comment.update(text=clean_text(comment["text_raw"]))

# Show the first record to check the new field.
comments[0]

{'id': 'yt_Fyk8Ob7CRjw_Ugwc6D97EMahyFzA7Xx4AaABAg',
 'source_platform': 'youtube',
 'source_channel': 'digi24hd56',
 'text_raw': 'TV  DIGI  24   TV  A  SISTEMULUI   !!!!',
 'video_id': 'Fyk8Ob7CRjw',
 'video_title': '🟣 Știrile Digi24 de la ora 12 – 4 mai 2026',
 'video_date': '2026-05-04',
 'comment_date': '2026-05-04',
 'likes': 0,
 'collected_at': '2026-05-04',
 'text': 'TV DIGI 24 TV A SISTEMULUI !!!!'}

MIN_CHARS = 60

# Keep only comments whose cleaned text has at least MIN_CHARS characters.
comments_clean = list(
    filter(lambda record: len(record["text"]) >= MIN_CHARS, comments)
)

print("Comentarii brute:", len(comments))
print("Comentarii după filtrarea lungimii:", len(comments_clean))

Comentarii brute: 2
Comentarii după filtrarea lungimii: 1

MIN_ALPHA = 0.5

def alpha_ratio(text):
    """Return the fraction of characters in *text* that are alphabetic.

    An empty *text* yields 0 so the division below is never attempted on a
    zero length.
    """
    total = len(text)
    if total == 0:
        return 0
    alpha_count = sum(1 for ch in text if ch.isalpha())
    return alpha_count / total

# Keep only comments whose share of alphabetic characters reaches MIN_ALPHA.
comments_clean = list(
    filter(
        lambda record: alpha_ratio(record["text"]) >= MIN_ALPHA,
        comments_clean,
    )
)

print("Comentarii după filtrarea literelor:", len(comments_clean))

Comentarii după filtrarea literelor: 1

# Remove duplicates: texts are compared in lowercase and the first occurrence
# of each text is the one that is kept, in original order.
seen_texts = set()
unique_comments = []

for comment in comments_clean:
    text = comment["text"].lower()
    if text in seen_texts:
        continue
    seen_texts.add(text)
    unique_comments.append(comment)

comments_clean = unique_comments

print("Comentarii finale după deduplicare:", len(comments_clean))

Comentarii finale după deduplicare: 1

# Write the cleaned comments as JSON Lines (one JSON object per line), creating
# the target folder first if it does not exist yet.
clean_output_file = ROOT.joinpath(
    "data", "cleaned", f"{student_id}_youtube_clean.jsonl"
)
clean_output_file.parent.mkdir(parents=True, exist_ok=True)

with clean_output_file.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.writelines(
        json.dumps(comment, ensure_ascii=False) + "\n"
        for comment in comments_clean
    )

print("Comentarii curate salvate:", len(comments_clean))
print("Fișier:", clean_output_file)

Comentarii curate salvate: 1
Fișier: c:\PROJECTS\echochamber-app\data\cleaned\student_01_youtube_clean.jsonl

import re

def clean_comments(comments, min_chars=60, min_alpha=0.5, lang="ro"):
    """Clean, filter and de-duplicate raw comment records.

    For each record, the ``"text_raw"`` value has every token starting with
    ``http`` removed and whitespace runs collapsed to single spaces. The record
    is kept only when the cleaned text has at least ``min_chars`` characters,
    its share of alphabetic characters is at least ``min_alpha``, and the same
    text (compared in lowercase) has not already been kept.

    Args:
        comments: Iterable of dict records. The original text is read from the
            ``"text_raw"`` key; a missing or ``None`` value is treated as empty
            text instead of raising.
        min_chars: Minimum length of the cleaned text.
        min_alpha: Minimum fraction (0-1) of alphabetic characters in the
            cleaned text.
        lang: Value stored under ``"lang"`` on every kept record. Defaults to
            ``"ro"``, the value that used to be hard-coded.

    Returns:
        A new list with shallow copies of the kept records, in input order,
        each extended with ``"text"`` (the cleaned text) and ``"lang"``. The
        input records are not modified.
    """
    cleaned = []
    seen_texts = set()

    for comment in comments:
        # 1. Clean the text (a missing/None "text_raw" counts as empty text).
        text = comment.get("text_raw") or ""
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"\s+", " ", text).strip()

        # 2. Length filter.
        if len(text) < min_chars:
            continue

        # 3. Letter-share filter; empty text (possible when min_chars is 0)
        #    gets a share of 0 rather than dividing by zero.
        letters = sum(char.isalpha() for char in text)
        letter_share = letters / len(text) if text else 0
        if letter_share < min_alpha:
            continue

        # 4. Duplicate filter: only the first occurrence of a text is kept.
        text_key = text.lower()
        if text_key in seen_texts:
            continue
        seen_texts.add(text_key)

        # 5. Keep a copy of the record with the cleaned text and language tag.
        new_comment = comment.copy()
        new_comment["text"] = text
        new_comment["lang"] = lang
        cleaned.append(new_comment)

    return cleaned

# Run the whole cleaning pipeline in one call on the raw comment records.
comments_clean = clean_comments(comments, min_chars=60, min_alpha=0.5)

print("Comentarii brute:", len(comments))
print("Comentarii curate:", len(comments_clean))

Comentarii brute: 2
Comentarii curate: 1

# Print the raw and the cleaned text for up to three kept comments, separated
# by a "---" line, to check the cleaning by eye.
for comment in comments_clean[:3]:
    print("RAW:", comment["text_raw"])
    print("CLEAN:", comment["text"])
    print("---")

RAW: Auristule ați pierdut din start facind o coaliție cu PSD  va compromis o sa va invită psd
CLEAN: Auristule ați pierdut din start facind o coaliție cu PSD va compromis o sa va invită psd
---

# Write the output of clean_comments as JSON Lines (one JSON object per line),
# creating the target folder first if it does not exist yet.
clean_output_file = ROOT.joinpath(
    "data", "cleaned", f"{student_id}_youtube_clean.jsonl"
)
clean_output_file.parent.mkdir(parents=True, exist_ok=True)

with clean_output_file.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.writelines(
        json.dumps(comment, ensure_ascii=False) + "\n"
        for comment in comments_clean
    )

print("Fișier salvat:", clean_output_file)
print("Comentarii salvate:", len(comments_clean))

C3 — Colectare comentarii YouTube¶

1. Ce trebuie să avem pregătit¶

2. Încărcăm cheia API¶

3. Alegem canalul și numărul de videoclipuri¶

4. Găsim canalul YouTube¶

5. Luăm cele mai recente videoclipuri¶

6. Colectăm comentariile¶

Explorare și curățare¶

7. Inspectăm primele comentarii¶

8. Curățare minimă a textului¶

9. Aplicăm curățarea¶

10. Filtrăm comentariile prea scurte¶

11. Filtrăm textele cu prea puține litere¶

12. Eliminăm duplicatele¶

13. Salvăm fișierul curățat¶

Funcția de curățare¶