gohorsejobs/job-scraper-multisite/scrapers/geekhunter_scraper.py
Tiago Yamamoto a4abcf8e05 feat: SEO optimization and dynamic jobs API integration
Backend:
- Add Swagger annotations to all job handlers (GET, POST, PUT, DELETE)
- Clean up job handler code

Frontend:
- Expand api.ts with ApiJob types, pagination, and transform function
- Update footer with 'Vagas por Tecnologia' SEO links
- Add robots.txt with crawler directives
- Add sitemap.xml with main pages and job URLs
- Change branding to GoHorse Jobs
2025-12-14 09:16:44 -03:00

89 lines
3.2 KiB
Python

"""
Scraper para GeekHunter - https://www.geekhunter.com.br/vagas
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Headers para simular navegador e evitar bloqueios
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
}
def scrape_geekhunter(delay: float = 2.0) -> pd.DataFrame:
    """
    Scrape job listings from GeekHunter (https://www.geekhunter.com.br/vagas).

    Args:
        delay: Seconds to sleep before the request (anti-blocking throttle).

    Returns:
        DataFrame with columns: titulo, empresa, localizacao, link, fonte.
        The columns are always present, even when the request fails or no
        job cards are found (the frame is then simply empty) — callers can
        safely select/merge on them.
    """
    columns = ['titulo', 'empresa', 'localizacao', 'link', 'fonte']
    url = "https://www.geekhunter.com.br/vagas"
    vagas = []
    try:
        # Anti-blocking delay before hitting the site.
        time.sleep(delay)
        print(f"🔍 Raspando vagas de: {url}")
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Locate job cards with progressively looser selectors, since the
        # site's markup may change. NOTE(review): these selectors are
        # heuristic — verify against the live page structure.
        job_cards = soup.select('.job-card') or soup.select('[class*="job"]') or soup.select('article')
        for card in job_cards:
            try:
                # Title: prefer heading tags, then a generic .title class.
                titulo_elem = card.select_one('h2') or card.select_one('h3') or card.select_one('.title')
                titulo = titulo_elem.get_text(strip=True) if titulo_elem else "N/A"
                # Company name.
                empresa_elem = card.select_one('.company') or card.select_one('[class*="company"]')
                empresa = empresa_elem.get_text(strip=True) if empresa_elem else "N/A"
                # Location; default to "Remoto" when not present on the card.
                loc_elem = card.select_one('.location') or card.select_one('[class*="location"]')
                localizacao = loc_elem.get_text(strip=True) if loc_elem else "Remoto"
                # Job link: prefer anchors pointing at /vagas/; turn
                # site-relative hrefs into absolute URLs.
                link_elem = card.select_one('a[href*="/vagas/"]') or card.select_one('a')
                if link_elem:
                    href = link_elem.get('href', '')
                    link = f"https://www.geekhunter.com.br{href}" if href.startswith('/') else href
                else:
                    link = url
                vagas.append({
                    'titulo': titulo,
                    'empresa': empresa,
                    'localizacao': localizacao,
                    'link': link,
                    'fonte': 'GeekHunter'
                })
            except Exception as e:
                # Best-effort scrape: one malformed card must not abort the run.
                print(f"⚠️ Erro ao processar card: {e}")
                continue
        print(f"{len(vagas)} vagas encontradas no GeekHunter")
    except requests.exceptions.RequestException as e:
        print(f"❌ Erro na requisição ao GeekHunter: {e}")
    except Exception as e:
        print(f"❌ Erro inesperado no GeekHunter: {e}")
    # Explicit columns keep the schema stable even when `vagas` is empty;
    # with rows present the result is identical to pd.DataFrame(vagas).
    return pd.DataFrame(vagas, columns=columns)
if __name__ == "__main__":
# Teste individual do scraper
df = scrape_geekhunter()
print(df.head())