Agent/AuctionServices/Scraper/Drouot/DrouotData.js

394 lines
12 KiB
JavaScript

// DrouotData.js
const {ScraperTools} = require('../Scraper');
const urlModule = require('url');
const moment = require('moment-timezone');
const { Console } = require('console');
const { title } = require('process');
/**
 * Scraper for the Drouot auction platform (drouot.com).
 *
 * Element lookups are delegated to helpers inherited from ScraperTools
 * (getTextContent, getAttribute, getTextContentElement, getAttributeElement),
 * which are defined outside this file.
 * NOTE(review): this class assumes those helpers resolve to '' when nothing
 * matches — confirm against ScraperTools.
 *
 * Methods are declared as arrow-function fields so they stay bound to the
 * instance when passed around as callbacks.
 */
class DrouotData extends ScraperTools {
  _Name = 'drouot';
  _LiveData = null;

  /**
   * Classify a Drouot URL as a sale or a lot page and extract its id.
   *
   * Lot URL : https://drouot.com/fr/l/{lotID}-some-slug
   * Sale URL: https://drouot.com/fr/v/{saleID}-some-slug
   *
   * @param {string} url - Absolute URL.
   * @returns {Promise<{TypeUrl: string, saleID: (string|number), lotID: (string|number), urlSale: string, urlLot: string}>}
   *   Ids are strings when found and the number 0 otherwise. TypeUrl is 'Sale',
   *   'Lot' or '' when the URL is neither (or carries no id segment).
   * @throws {TypeError} When `url` is not a valid absolute URL.
   */
  getUrlInfo = async (url) => {
    const parsedUrl = new urlModule.URL(url);
    // pathParts: [locale, 'v' | 'l', '{id}-slug', ...]
    const [, kind, slug] = parsedUrl.pathname.split('/').filter(Boolean);
    const info = {
      TypeUrl: '',
      saleID: 0,
      lotID: 0,
      urlSale: '',
      urlLot: '',
    };
    // Without an id segment there is nothing to extract; return the defaults
    // instead of crashing on an undefined path part.
    if (!slug) {
      return info;
    }
    const id = slug.split('-')[0];
    // Query string and hash are dropped on purpose.
    const cleanUrl = parsedUrl.origin + parsedUrl.pathname;
    if (kind === 'v') {
      info.TypeUrl = 'Sale';
      info.saleID = id;
      info.urlSale = cleanUrl;
    } else if (kind === 'l') {
      info.TypeUrl = 'Lot';
      info.lotID = id;
      info.urlLot = cleanUrl;
    }
    return info;
  };

  // ## Lot

  /**
   * Open a lot page and collect the URLs of its full-HD pictures by listening
   * to image responses while clicking the "next" control of the gallery.
   *
   * Clicking stops as soon as an already-seen picture is loaded again, or
   * after `maxClicks` clicks so that a gallery which never repeats (or never
   * loads a picture) cannot loop forever.
   *
   * @param {object} page - Puppeteer page.
   * @param {string} Url - Lot page URL.
   * @param {number} [maxClicks=200] - Upper bound on "next" clicks.
   * @returns {Promise<string[]>} Unique picture URLs in the order they were seen.
   */
  getPictures = async (page, Url, maxClicks = 200) => {
    console.log(`getPictures ${this._Name}: ${Url}`);
    const seen = [];
    const onResponse = async (response) => {
      const url = response.url();
      if (response.request().resourceType() !== 'image' || !url.includes('size=fullHD')) {
        return;
      }
      try {
        // Only record the picture once its body has actually been received.
        await response.buffer();
        console.log(`push ${url}`);
        seen.push(url);
      } catch (error) {
        console.error(`getPictures ${this._Name}: could not read ${url}`, error);
      }
    };
    page.on('response', onResponse);
    try {
      await page.goto(Url);
      await page.waitForTimeout(500);
      const [buttonNext] = await page.$x("//*[@id='next']");
      if (buttonNext) {
        let clicks = 0;
        let hasDuplicate = false;
        do {
          console.log('click');
          await buttonNext.evaluate((b) => b.click());
          await page.waitForTimeout(500);
          clicks += 1;
          // A repeated URL means the gallery has wrapped around.
          hasDuplicate = new Set(seen).size !== seen.length;
        } while (!hasDuplicate && clicks < maxClicks);
        if (!hasDuplicate) {
          console.warn(`getPictures ${this._Name}: stopped after ${clicks} clicks without a repeated picture`);
        }
      }
    } finally {
      // Always detach, otherwise every call would leave one more listener on the page.
      page.off('response', onResponse);
    }
    return [...new Set(seen)];
  };

  /**
   * Read the lot number from a lot page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Page text with the first 'Lot ' removed, or '' when nothing was read.
   */
  getLotNumber = async (page) => {
    const lotNumberXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[1]/span/span',
    ];
    const lotNumberString = await this.getTextContent(lotNumberXPath, page, 'lotNumberXPath');
    return lotNumberString ? lotNumberString.replace('Lot ', '') : '';
  };

  /**
   * Read the lot title from a lot page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Title, cut to 90 characters followed by '...' when longer.
   */
  getLotTitle = async (page) => {
    const lotTitleXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[5]/h1',
    ];
    const lotTitle = (await this.getTextContent(lotTitleXPath, page, 'lotTitleXPath')) ?? '';
    return lotTitle.length > 90 ? `${lotTitle.substring(0, 90)}...` : lotTitle;
  };

  /**
   * Read the low/high estimate from a lot page.
   *
   * Amounts may use whitespace as thousands separator ("1 200") or none ("1200").
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<{EstimateLow: number, EstimateHigh: number}>}
   *   0 for each bound that could not be read.
   */
  getEstimate = async (page) => {
    const EstimateXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[1]/span/span',
    ];
    const EstimateString = await this.getTextContent(EstimateXPath, page, 'EstimateXPath');
    let EstimateLow = 0;
    let EstimateHigh = 0;
    if (EstimateString) {
      // A run of digits optionally followed by whitespace-separated groups of exactly 3 digits.
      const matches = EstimateString.match(/\d+(?:\s\d{3}(?!\d))*/g);
      if (matches) {
        const [low, high] = matches.map((amount) => Number.parseInt(amount.replace(/\s/g, ''), 10));
        EstimateLow = low;
        if (high !== undefined) {
          EstimateHigh = high;
          console.log('Low:', EstimateLow);
          console.log('High:', EstimateHigh);
        }
      } else {
        console.error('Could not extract numbers.');
      }
    }
    return { EstimateLow, EstimateHigh };
  };

  /**
   * Read the lot description from a lot page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Whatever getTextContent yields for the description node.
   */
  getDescription = async (page) => {
    const DescriptionXPath = [
      '//h3[contains(@class, "descriptionLineWrap")]',
    ];
    return this.getTextContent(DescriptionXPath, page, 'DescriptionXPath');
  };

  /**
   * Read the buyer fees from a lot page.
   *
   * The fees link is looked up at a first position, then at a second one when
   * the first yields no digit.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<{feesText: string, fees: (string|number)}>}
   *   `feesText` is the whitespace-normalised label. `fees` is the first number
   *   found in it as a string with '.' as decimal separator ("28,8 %" -> "28.8"),
   *   or the number 0 when the label holds no number.
   */
  getFees = async (page) => {
    const primaryXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[3]/a/span',
    ];
    const fallbackXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[2]/a/span',
    ];
    let feesText = await this.getTextContent(primaryXPath, page, 'FeesXPath');
    if (!/\d/.test(feesText)) {
      feesText = await this.getTextContent(fallbackXPath, page, 'FeesXPath');
    }
    feesText = (feesText ?? '').replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();
    // Accept both '.' and the French ',' as decimal separator.
    const match = feesText.match(/\d+(?:[.,]\d+)?/);
    const fees = match ? match[0].replace(',', '.') : 0;
    return { feesText, fees };
  };

  /**
   * Extract the lot id from a lot URL.
   *
   * @param {string} url - Absolute lot URL.
   * @returns {Promise<(string|number)>} Lot id, or 0 when the URL is not a lot URL.
   */
  getLotID = async (url) => {
    const { lotID } = await this.getUrlInfo(url);
    return lotID;
  };

  /**
   * Find the catalogue link on a lot page and derive the sale id and URL from it.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<{id_sale: (string|number), urlSale: string}>}
   *   `{ id_sale: 0, urlSale: '' }` when no catalogue link was read.
   */
  getSaleID = async (page) => {
    const UrlCatalogueXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[10]/div/div[2]/div[3]/a[1]',
    ];
    const href = await this.getAttribute(UrlCatalogueXPath, page, 'href', 'UrlCatalogueXPath');
    console.log(`UrlCatalogue : ${href}`);
    // No link found: report "unknown sale" rather than failing on an invalid URL.
    if (!href) {
      return { id_sale: 0, urlSale: '' };
    }
    const UrlCatalogue = href.startsWith('/') ? `https://drouot.com${href}` : href;
    const { saleID, urlSale } = await this.getUrlInfo(UrlCatalogue);
    return { id_sale: saleID, urlSale };
  };

  // ## Lot List

  /**
   * Build lot summaries from the lot cards of a sale page.
   *
   * The platform id is taken from the card link, which is expected to follow
   * the lot URL format handled by getUrlInfo (/fr/l/{lotID}-slug); links of the
   * form "lot-{id}.html" are still accepted. Cards that cannot be read are
   * logged and skipped.
   *
   * @param {object} page - Puppeteer page.
   * @param {Iterable<object>} Elements - Element handles of the lot cards.
   * @returns {Promise<Array<{title: string, idPlatform: string, platform: string, lotNumber: (string|undefined)}>>}
   */
  _getLotInfoList = async (page, Elements) => {
    const LotnameXPath = [
      './/a/div/div/div[5]/div/div[1]',
      './/a/div/div/div[4]/div/div[1]',
    ];
    const LotUrlXPath = [
      './/a',
    ];
    const LotList = [];
    for (const element of Elements) {
      try {
        const Lotname = await this.getTextContentElement(LotnameXPath, page, element, 'LotnameXPath');
        const urlLot = await this.getAttributeElement(LotUrlXPath, page, element, 'href', 'UrlListLot');
        if (!urlLot) {
          throw new Error('Lot card has no link');
        }
        // Card links may be relative; resolve them before parsing.
        const absoluteUrl = new urlModule.URL(urlLot, 'https://drouot.com').href;
        const { lotID } = await this.getUrlInfo(absoluteUrl);
        const idPlatform = lotID || urlLot.match(/lot-(.*)\.html/)?.[1];
        if (!idPlatform) {
          throw new Error(`Cannot extract a lot id from ${urlLot}`);
        }
        LotList.push({
          title: Lotname,
          idPlatform,
          platform: this._Name,
          lotNumber: Lotname.split('Lot ')[1],
        });
      } catch (e) {
        console.error(e);
      }
    }
    return LotList;
  };

  /**
   * Collect the lots of a sale, following the pagination until the
   * "next page" button is missing or disabled.
   *
   * @param {object} page - Puppeteer page already showing the sale.
   * @returns {Promise<object[]>} Lot summaries of every visited page, in order.
   */
  getLotList = async (page) => {
    const LotListXPath = '//div[contains(@class, "sale-item-wrapper")]';
    // Only matches the button while it is enabled.
    const NextPageButtonXPath = "//button[contains(@aria-label, 'Page suivante') and not(contains(@class, 'v-pagination__navigation--disabled'))]";
    const LotList = [];
    let hasNextPage = false;
    do {
      const Elements = await page.$x(LotListXPath);
      if (Elements.length > 0) {
        LotList.push(...(await this._getLotInfoList(page, Elements)));
      }
      const [NextPageButton] = await page.$x(NextPageButtonXPath);
      hasNextPage = Boolean(NextPageButton);
      if (hasNextPage) {
        await NextPageButton.evaluate((b) => b.click());
        await page.waitForTimeout(1000);
        console.log('Next Page');
      }
    } while (hasNextPage);
    return LotList;
  };

  // ## Sale

  /**
   * Read the sale title from a sale page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>}
   */
  getSaleTitle = async (page) => {
    const SaleTitleXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/h1',
    ];
    return this.getTextContent(SaleTitleXPath, page, 'SaleTitleXPath');
  };

  /**
   * Read the sale date from a sale page and format it with moment's default
   * `format()` in the Europe/Paris timezone.
   *
   * NOTE(review): the parse format has no year token and relies on the 'fr'
   * moment locale — confirm the locale is loaded and how sales in another
   * calendar year should be handled.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Formatted date; 'Invalid date' when the page text cannot be parsed.
   */
  getSaleDate = async (page) => {
    // TODO: live-sale detection (looking for the "streaming-subscriber" element)
    // is disabled, so every sale is currently handled as a scheduled one.
    const isLive = false;
    let SaleDate;
    if (!isLive) {
      await page.waitForTimeout(400);
      const SaleDateXPath = [
        '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/div[1]/div',
      ];
      const SaleDateString = (await this.getTextContent(SaleDateXPath, page, 'SaleDateXPath')) ?? '';
      // Collapse newlines and runs of whitespace into single spaces.
      const cleanStr = SaleDateString.trim().replace(/\s+/g, ' ');
      const parsed = moment.tz(cleanStr, 'dddd D MMMM à HH:mm (z)', 'fr', 'Europe/Paris');
      if (!parsed.isValid()) {
        console.error(`SaleDate : cannot parse "${cleanStr}"`);
      }
      SaleDate = parsed.format();
    } else {
      // A live sale is happening now.
      SaleDate = moment.tz('Europe/Paris').format();
    }
    console.log(`SaleDate : ${SaleDate}`);
    return SaleDate;
  };

  /**
   * Read the sale location from a sale page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Trimmed location text.
   */
  getSaleLocation = async (page) => {
    const SaleLocationXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/div[4]',
    ];
    const SaleLocation = (await this.getTextContent(SaleLocationXPath, page, 'SaleLocationXPath')) ?? '';
    return SaleLocation.trim();
  };

  /**
   * Read the auction house name from a sale page.
   *
   * @param {object} page - Puppeteer page.
   * @returns {Promise<string>} Name with newlines removed and whitespace normalised.
   */
  getSaleHouseName = async (page) => {
    const SaleHouseNameXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/h4/a[1]/span',
    ];
    const SaleHouseName = (await this.getTextContent(SaleHouseNameXPath, page, 'SaleHouseNameXPath')) ?? '';
    return SaleHouseName.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();
  };

  // ## Live Data

  /**
   * Store the live-sale payload later queried by getLiveDataLot.
   *
   * @param {object} Data - Payload; getLiveDataLot reads its `lots` collection.
   */
  setLiveData = (Data) => {
    this._LiveData = Data;
  };

  /**
   * Find a lot in the stored live data by id.
   *
   * Ids are compared as strings so that an id parsed from a URL ('123')
   * matches a numeric id (123).
   *
   * @param {(string|number)} lotId - Id to look for.
   * @returns {Promise<(object|undefined)>} The first matching lot, or undefined
   *   when there is none or no live data has been set.
   */
  getLiveDataLot = async (lotId) => {
    const lots = this._LiveData?.lots ?? [];
    const wanted = String(lotId);
    for (const lot of lots) {
      if (String(lot.id) === wanted) {
        return lot;
      }
    }
    return undefined;
  };
}
module.exports = DrouotData