gohorsejobs/JobScraper_MultiSite/scrapers/programathor_scraper.py
Tiago Yamamoto 8856357acd feat: add JobScraper_MultiSite Python project
- main_scraper.py: Main entry point, consolidates data from all sources
- scrapers/programathor_scraper.py: Scraper for ProgramaThor
- scrapers/geekhunter_scraper.py: Scraper for GeekHunter
- requirements.txt: Python dependencies (requests, beautifulsoup4, pandas)
- README.md: Documentation with usage instructions
- Modular architecture for easy addition of new sites
2025-12-14 09:10:17 -03:00

89 lines
3.1 KiB
Python

"""
Scraper para ProgramaThor - https://programathor.com.br/jobs
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Browser-like request headers so the site treats us as a normal visitor
# (reduces the chance of naive anti-bot blocking).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
}
def scrape_programathor(delay: float = 2.0) -> pd.DataFrame:
    """
    Scrape job listings from the ProgramaThor job board.

    Args:
        delay: Seconds to sleep before issuing the request (anti-blocking).

    Returns:
        DataFrame with columns: titulo, empresa, localizacao, link, fonte.
        The columns are always present — even when the request fails or no
        job cards are found — so downstream consumers can rely on the schema.
        (Bug fix: ``pd.DataFrame([])`` would otherwise produce a DataFrame
        with no columns at all on the empty/error path.)
    """
    url = "https://programathor.com.br/jobs"
    # Explicit schema guarantees a stable shape on every return path.
    columns = ['titulo', 'empresa', 'localizacao', 'link', 'fonte']
    vagas = []
    try:
        # Anti-blocking delay before hitting the site.
        time.sleep(delay)
        print(f"🔍 Raspando vagas de: {url}")
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Job cards — selectors may need adjusting if the site layout changes.
        job_cards = soup.select('.cell-list')
        for card in job_cards:
            try:
                # Title: primary selector with a class-based fallback.
                titulo_elem = card.select_one('h3') or card.select_one('.title')
                titulo = titulo_elem.get_text(strip=True) if titulo_elem else "N/A"
                # Company name.
                empresa_elem = card.select_one('.company-name') or card.select_one('h4')
                empresa = empresa_elem.get_text(strip=True) if empresa_elem else "N/A"
                # Location — defaults to "Remoto" when the card has none.
                loc_elem = card.select_one('.location') or card.select_one('.info')
                localizacao = loc_elem.get_text(strip=True) if loc_elem else "Remoto"
                # Job link: absolutize site-relative hrefs; fall back to the
                # listing URL when the card carries no anchor.
                link_elem = card.select_one('a[href*="/jobs/"]')
                if link_elem:
                    href = link_elem.get('href', '')
                    link = f"https://programathor.com.br{href}" if href.startswith('/') else href
                else:
                    link = url
                vagas.append({
                    'titulo': titulo,
                    'empresa': empresa,
                    'localizacao': localizacao,
                    'link': link,
                    'fonte': 'ProgramaThor'
                })
            except Exception as e:
                # Best-effort: a malformed card must not abort the whole scrape.
                print(f"⚠️ Erro ao processar card: {e}")
                continue
        print(f"{len(vagas)} vagas encontradas no ProgramaThor")
    except requests.exceptions.RequestException as e:
        print(f"❌ Erro na requisição ao ProgramaThor: {e}")
    except Exception as e:
        # Boundary catch-all so a scraper failure never crashes main_scraper.
        print(f"❌ Erro inesperado no ProgramaThor: {e}")
    return pd.DataFrame(vagas, columns=columns)
if __name__ == "__main__":
    # Standalone smoke test for this scraper.
    resultado = scrape_programathor()
    print(resultado.head())