Azure Data Lake Gen 2 & Python copying files within Data Lake folders

160 views Asked by At

Using the samples proved here: https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python?tabs=account-key. I have been able to connect to my Azure Data Lake instance and peruse the directory/file structure. Do some simple operations. All good.
Next step: I have load up a large file into the data lake. Issue: I need two copies of the file in the data lake, one is the As-Received copy and the second may get modified. Want avoid loading the file twice. Was hoping I could load the file once and then copy to the second location.
In the above samples I see how to upload a file, move a file & how to delete a file, but no copy a file.

Your kind assistance is requested.

KD


** New CODE ** New Attempt ** New Exception ** revised my coded based on Googling and browsing

import datetime 
import time
import os, uuid
from azure.identity import DefaultAzureCredential
from azure.storage.blob  import BlobServiceClient, BlobClient, ContainerClient
from azure.storage.blob import ResourceTypes, AccountSasPermissions, BlobSasPermissions
from azure.storage.blob import generate_account_sas , generate_blob_sas
import  azure.core.exceptions


storage_account_name = "XXdatarepos"
storage_account_key = "qvoVHq5NP9EtzKcmH1mm9kXXXXXXXXXXXXXXXXXX**XX**XXAStWgFLpA=="
container_name = "iadata"
local_dir_name  = "C:\XXX\SupplierManagement"
target_dir_name = "Inbox/Test/"
target_dir_name2 = "Repos/SupplierXXX/Data/fooMar9/"
file_name =   "compare_all.xlsx"
local_filePath = os.path.join(local_dir_name, file_name)

target_blob = target_dir_name  +  file_name
target_blob2 = target_dir_name2  +  file_name

account_url = "https://"+ storage_account_name + ".blob.core.windows.net"

# Create the BlobServiceClient object
blob_service_client = BlobServiceClient(account_url, credential=storage_account_key,   connection_verify= False)
containers = blob_service_client.list_containers( name_starts_with="iadata" , include_metadata=True)
container_client = blob_service_client.get_container_client("iadata")   
cc_url  = container_client.url

# Create sas token for blob
sas_resource_types=ResourceTypes(service=True, object=True, container=True)
try:
    sas_token_target = generate_account_sas(  account_name = blob_service_client.account_name, 
                       container_name=container_client.container_name,  
                       account_key = storage_account_key ,  
                       blob_name=target_blob, 
                       resource_types = sas_resource_types,
                       permission= AccountSasPermissions(read=True) ,   expiry = datetime.timedelta(hours=4)  )
except Exception as error:
    print(error)    
    print( type(error).__name__ )
    
    


source_blob_client = BlobClient(account_url = account_url, container_name= container_client.container_name,  blob_name=target_blob, credential = sas_token_target   )
##  ##  source_blob_client = container_client.get_blob_client(target_blob )
##  ##  source_blob_client = BlobClient(account_url = cc_url, blob_name=target_blob, credential = sas_token_target   )

target_blob_client =   container_client.get_blob_client(target_blob2)

target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)

copy_properties = target_blob_client.get_blob_properties().copy

if copy_properties.status != "success":
    target_blob_client.abort_copy(copy_properties.id)
    raise Exception(
            f"Unable to copy blob %s with status %s"
            % (target_blob, copy_properties.status)
    )

** ** New Exception ** ** The warning about InsecureRequestWarning is new

ipdb> C:\Users\ne098406\.conda\envs\python_3.7_XXX\lib\site-packages\urllib3\connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
  InsecureRequestWarning)
azure.core.exceptions.ResourceNotFoundError: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z
ErrorCode:CannotVerifyCopySource
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>CannotVerifyCopySource</Code><Message>Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:3d2343a2-001e-00a6-5d2c-7ae40b000000
Time:2024-03-19T18:40:46.7150943Z</Message></Error>
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
     55 target_blob_client =   container_client.get_blob_client(target_blob2)
     56 
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
     58 
     59 copy_properties = target_blob_client.get_blob_properties().copy


ipdb> --Return--
None
> c:\kbd\testazure\testblobcopymar18.py(57)<module>()
     55 target_blob_client =   container_client.get_blob_client(target_blob2)
     56 
---> 57 target_blob_client.start_copy_from_url(source_blob_client.url, requires_sync=True)
     58 
     59 copy_properties = target_blob_client.get_blob_properties().copy 

The source_blob_client.url is below

ipdb> 
ipdb> source_blob_client.url
'https://iadatarepos.blob.core.windows.net/iadata/Inbox/Test/compare_all.xlsx?se=4%3A00%3A00&sp=r&sv=2023-11-03&ss=b&srt=sco&sig=an0cyq5YK%2BL0woSLQIUzWUoz9V1GbHHQyJnOQipxDCI%3D'

If I put this URL into Chrome (without the single quotes) I get AuthenticationFailed Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature. RequestId:4f30a97b-801e-003e-75cd-7ac46a000000 Time:2024-03-20T13:53:05.9201944Z Signature fields not well formed.

Your kind assistance requested.

KBD

1

There are 1 answers

8
Bhavani On

You can use below code to copy the file from one directory to another directory in ADLS Gen2 using python:

from azure.storage.blob import BlobServiceClient

def copy_files_to_adls(account_url, sas_token, source_container, source_directory, destination_container, destination_directory):
    blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token) 
    source_container_client = blob_service_client.get_container_client(source_container)
    blobs = source_container_client.list_blobs(name_starts_with=source_directory) 
    for blob in blobs:
        source_blob_path = "<sourcePath>"
        destination_blob_path = "<destinationDirectory>/<fileName>"
        blob_service_client.get_blob_client(destination_container, destination_blob_path).start_copy_from_url(source_blob_path)    
    print("Files copied successfully!")


account_url = 'https://adlsc.blob.core.windows.net'
account_name = '<accountname>'
account_key = "<accountKey>"
source_container = '<sourceContainer>'
source_directory = '<sourceDirectory>'
destination_container = '<destinationContainer>'
destination_directory = '<destinationDirectory>'

copy_files_to_adls(account_url, account_key, source_container, source_directory, destination_container, destination_directory)

enter image description here

The file is copied successfully with same name.

enter image description here